# FEATURE ENGINEERING 
## Introduction
This notebook performs feature engineering for the REIS project. It prepares the cleaned dataset for modeling by creating useful features, encoding categorical data, removing unnecessary columns, and saving the result as a model-ready dataset.

### 1. Import Libraries and Load Dataset

In [1]:
import pandas as pd 
import numpy as np
import json

In [None]:
# Load the cleaned Chennai listings CSV; the path is relative to the
# notebook's working directory.
INPUT_CSV = "cleaned_chennai_dataset.csv"
df = pd.read_csv(INPUT_CSV)

# Show the first rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,location,area_sqft,resale,no_of_bedrooms,nearby_school_1km,nearby_hospital_1km,nearby_mall_1km,nearby_gym_1km,swimmingpool,rainwaterharvesting,sportsfacility,powerbackup,carparking,childrens_playarea,liftavailable,price
0,perungalathur,1310,0,3,0,0,0,0,0,0,0,0,0,0,0,5500000.0
1,madhavaram,1126,0,2,0,0,0,1,1,1,0,1,0,1,1,5350000.0
2,karapakkam,1307,0,3,0,0,0,1,1,1,1,1,1,1,0,8205000.0
3,thiruvidandhai,3600,0,3,0,0,0,1,1,0,0,1,0,1,0,23400000.0
4,iyyappanthangal,1700,0,3,0,0,0,1,1,0,0,1,0,1,0,10100000.0


### 2. Create Engineered Features

In [None]:
# Build the modeling target as the natural log of the sale price.
#
# np.log is only finite for strictly positive inputs: a price of 0 becomes
# -inf and a negative price becomes NaN (with only a RuntimeWarning), which
# would silently corrupt the target column. Fail loudly instead so bad rows
# are fixed upstream. Missing prices (NaN) are not flagged here and stay NaN.
non_positive_prices = int((df["price"] <= 0).sum())
if non_positive_prices:
    raise ValueError(
        f"Cannot log-transform price: {non_positive_prices} row(s) have price <= 0"
    )

df["log_price"] = np.log(df["price"])

In [None]:
# Locations with fewer listings than this are pooled into a single "other"
# bucket so that each one-hot location column is backed by enough rows.
MIN_LISTINGS_PER_LOCATION = 20

# Number of listings per location, most frequent first.
locations_count = df.groupby("location").size().sort_values(ascending=False)

# Names of the locations that fall below the threshold.
rare_locations = locations_count[locations_count < MIN_LISTINGS_PER_LOCATION].index.tolist()

# Vectorized replacement: keep the original location unless it is rare.
# (Avoids a per-row Python lambda doing a linear scan of the rare list.)
is_rare_location = df["location"].isin(rare_locations)
df["location_grouped"] = df["location"].where(~is_rare_location, "other")

In [None]:
# Amenity indicator columns that are collapsed into a single score.
# (They hold 0/1 values in the preview rows shown earlier.)
amenity_col = [
    "nearby_school_1km",
    "nearby_hospital_1km",
    "nearby_mall_1km",
    "nearby_gym_1km",
    "swimmingpool",
    "rainwaterharvesting",
    "sportsfacility",
    "powerbackup",
    "carparking",
    "childrens_playarea",
    "liftavailable",
]

# Row-wise total across the amenity columns; for 0/1 flags this is the
# number of amenities a listing has.
amenity_flags = df[amenity_col]
df["amenity_score"] = amenity_flags.sum(axis="columns")

In [6]:
# Preview the frame now that log_price, location_grouped and amenity_score
# have been added.
df.head(n=5)

Unnamed: 0,location,area_sqft,resale,no_of_bedrooms,nearby_school_1km,nearby_hospital_1km,nearby_mall_1km,nearby_gym_1km,swimmingpool,rainwaterharvesting,sportsfacility,powerbackup,carparking,childrens_playarea,liftavailable,price,log_price,location_grouped,amenity_score
0,perungalathur,1310,0,3,0,0,0,0,0,0,0,0,0,0,0,5500000.0,15.520259,perungalathur,0
1,madhavaram,1126,0,2,0,0,0,1,1,1,0,1,0,1,1,5350000.0,15.492607,other,6
2,karapakkam,1307,0,3,0,0,0,1,1,1,1,1,1,1,0,8205000.0,15.920254,karapakkam,7
3,thiruvidandhai,3600,0,3,0,0,0,1,1,0,0,1,0,1,0,23400000.0,16.968247,other,4
4,iyyappanthangal,1700,0,3,0,0,0,1,1,0,0,1,0,1,0,10100000.0,16.128046,iyyappanthangal,4


### 3. Remove Redundant Columns

In [None]:
# Raw columns superseded by the engineered features:
#   location           -> location_grouped
#   individual amenity -> amenity_score
#   price              -> log_price
columns_to_drop = [
    "location",
    "nearby_school_1km",
    "nearby_hospital_1km",
    "nearby_mall_1km",
    "nearby_gym_1km",
    "swimmingpool",
    "rainwaterharvesting",
    "sportsfacility",
    "powerbackup",
    "carparking",
    "childrens_playarea",
    "liftavailable",
    "price",
]

# NOTE: this rebinds `df`, so re-running the cell without re-running the
# earlier ones raises KeyError (the columns are already gone).
df = df.drop(columns=columns_to_drop)

In [None]:
# Final column layout: categorical location first, numeric features next,
# and the log_price target last.
ordered_columns = [
    "location_grouped",
    "area_sqft",
    "resale",
    "no_of_bedrooms",
    "amenity_score",
    "log_price",
]

# Keep exactly these columns, in this order.
df = df.loc[:, ordered_columns]

df.head(n=5)

Unnamed: 0,location_grouped,area_sqft,resale,no_of_bedrooms,amenity_score,log_price
0,perungalathur,1310,0,3,0,15.520259
1,other,1126,0,2,6,15.492607
2,karapakkam,1307,0,3,7,15.920254
3,other,3600,0,3,4,16.968247
4,iyyappanthangal,1700,0,3,4,16.128046


### 4. Encode Categorical Columns

In [None]:
# One integer (0/1) indicator column per grouped location, named "loc_<value>".
location_dummies = pd.get_dummies(df["location_grouped"], prefix="loc", dtype=int)

# Everything except the categorical column that was just expanded.
non_location_columns = df.drop(columns=["location_grouped"])

# Indicator columns go first, followed by the remaining features and target.
df_encoded = pd.concat([location_dummies, non_location_columns], axis="columns")

df_encoded.head(n=5)

Unnamed: 0,loc_adyar,loc_alwarpet,loc_ambattur,loc_anna nagar,loc_avadi,loc_ayanambakkam,loc_chromepet,loc_egmore,loc_guduvancheri,loc_iyyappanthangal,...,loc_ullagaram,loc_vadapalani,loc_vandalur,loc_velachery,loc_velappanchavadi,area_sqft,resale,no_of_bedrooms,amenity_score,log_price
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1310,0,3,0,15.520259
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1126,0,2,6,15.492607
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1307,0,3,7,15.920254
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3600,0,3,4,16.968247
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1700,0,3,4,16.128046


### 5. Save the Model-Ready Dataset

In [10]:
# Write the encoded dataset to disk without the row index so the CSV
# contains only feature and target columns.
df_encoded.to_csv(path_or_buf="final_chennai_dataset.csv", index=False)

### 6. Export Model Inference Artifacts

In [None]:
# Distinct grouped-location labels (including "other" when any location was
# pooled), sorted alphabetically, saved as JSON.
unique_locations = df["location_grouped"].unique().tolist()
location_grouped_values = sorted(unique_locations)

with open("location_grouped_values.json", "w") as json_file:
    json_file.write(json.dumps(location_grouped_values, indent=4))

In [None]:
# Ordered list of model input columns: every encoded column except the
# log_price target. Saved as JSON alongside the dataset.
feature_frame = df_encoded.drop(columns="log_price")
final_feature_columns = list(feature_frame.columns)

with open("final_feature_columns.json", "w") as json_file:
    json_file.write(json.dumps(final_feature_columns, indent=4))

## Summary 
- Created the target variable using a log transformation of price
- Created domain-informed features such as amenity_score and grouped rare locations
- Removed raw price, individual amenity columns, and other redundant features
- One-hot encoded categorical variables to make the data model-ready
- Saved the final dataset for the modeling stage
- Exported preprocessing artifacts for inference