In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# Load the property data; the path is relative to the notebook's working directory.
# Per its file name this is presumably the output of the missing-value imputation step -- confirm.
df = pd.read_csv("../Dataset/gurgaon_properties_missing_value_imputation.csv")

In [None]:
# Sanity check: (rows, columns) of the loaded frame
df.shape

In [None]:
# Preview the first rows to see which columns are available
df.head()

Observation
* `price_per_sqft` is highly related to `price`.
* We will ask the user for their preferred sector and number of BHK; we are not going to ask for their expected `price_per_sqft`.
* We will also not ask which society the user is looking at: a user who already knows the society can check its price on any website or by visiting it in person, so asking for it is not logical.
* Therefore, we drop these 2 columns.

In [None]:
# Working frame without the two columns ruled out above; df itself is left untouched
train_df = df.drop(['society', 'price_per_sqft'], axis=1)

## Luxury score

Approach
* The luxury score is a continuous number and we don't know what a given score represents, i.e., does 45 mean a medium or a high level? So we will categorize the luxury column into low, medium and high levels.

In [None]:
# Distribution of the raw luxury score, used to judge sensible bin edges
sns.boxplot(data=df, x='luxury_score')

In [None]:
# Smallest and largest luxury score, one per line
print(df['luxury_score'].min(), df['luxury_score'].max(), sep='\n')

In [None]:
def categorize_luxury(score):
    """Bucket a numeric luxury score into "Low", "Medium" or "High".

    Bins are half-open and contiguous:
        [0, 50)    -> "Low"
        [50, 150)  -> "Medium"
        [150, inf) -> "High"

    The top bin is open-ended so that a score above the largest value seen in
    the current data is still labelled "High" instead of silently becoming None.

    Returns None for negative scores and for NaN (every comparison with NaN
    is False, so no branch matches).
    """
    if 0 <= score < 50:
        return "Low"
    if 50 <= score < 150:
        return "Medium"
    if score >= 150:
        return "High"
    return None

In [None]:
# Label every row with its luxury bucket (element-wise over the score column)
train_df['luxury_category'] = train_df['luxury_score'].map(categorize_luxury)

In [None]:
# Check that the new luxury_category column was added
train_df.head()

In [None]:
# NOTE(review): same preview as the previous cell; nothing changed in between
train_df.head()

## floor num

Approach
* Here too, we categorize the floor number.

In [None]:
# Distribution of the floor number, used to judge sensible bin edges
sns.boxplot(data=df, x='floorNum')

In [None]:
# Smallest and largest floor number, one per line
print(df['floorNum'].min(), df['floorNum'].max(), sep='\n')

In [None]:
def categorize_floor(floor):
    """Bucket a floor number into "Low Floor", "Mid Floor" or "High Floor".

    Bins are half-open and contiguous, so fractional values (e.g. 2.5 or 10.5
    coming from a float column) always land in a bucket:
        [0, 3)    -> "Low Floor"
        [3, 11)   -> "Mid Floor"
        [11, inf) -> "High Floor"

    For whole-number floors 0..51 the result is the same as with the closed
    ranges 0-2 / 3-10 / 11-51. The top bin is open-ended so a floor above the
    largest value in the current data is still "High Floor".

    Returns None for negative floors and for NaN (every comparison with NaN
    is False, so no branch matches).
    """
    if 0 <= floor < 3:
        return "Low Floor"
    if 3 <= floor < 11:
        return "Mid Floor"
    if floor >= 11:
        return "High Floor"
    return None

In [None]:
# Label every row with its floor bucket (element-wise over the floor column)
train_df['floor_category'] = train_df['floorNum'].map(categorize_floor)

In [None]:
# The raw numeric columns are replaced by their categorical versions.
# Rebind instead of mutating in place, and ignore already-missing columns,
# so that re-running this cell does not raise a KeyError.
train_df = train_df.drop(columns=['floorNum','luxury_score'], errors='ignore')

In [None]:
# Confirm floorNum and luxury_score are gone and the category columns remain
train_df.head()

Storing the training data. It will be helpful for creating the pipeline in Streamlit.

In [None]:
# Persist the un-encoded training frame (string categories intact); index=False keeps the row index out of the file
train_df.to_csv('../Dataset/gurgaon_properties_post_feature_selectionv2.csv', index=False)

* We will encode all the categories as numbers because a machine can't understand strings.
* We will use ordinal encoding. A tree-based model will work with it, because on seeing 0 it will not assume that 0 is less important and 1 is more important; it just puts cuts on the dataset and divides it into regions. For a linear model this is not reliable, because a linear model assumes importance based on the number (e.g. 3 is treated as more important than 1 in each column), so there you should use one-hot encoding.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Work on a copy so train_df keeps its original string categories
data_label_encoded = train_df.copy()

categorical_cols = train_df.select_dtypes(include=['object']).columns

# Ordinal-encode each categorical column with its own encoder. The fitted
# encoders are kept per column so the numeric codes can be mapped back to
# category names later (previously each encoder was discarded on the next iteration).
ordinal_encoders = {}
for col in categorical_cols:
    oe = OrdinalEncoder()
    # fit_transform returns an (n_rows, 1) array; flatten it before assigning to a single column
    data_label_encoded[col] = oe.fit_transform(data_label_encoded[[col]]).ravel()
    ordinal_encoders[col] = oe
    # Categories in code order: position i in the list is encoded as i
    print(oe.categories_)

# Separate the features from the target (no train/test split happens here)
X_label = data_label_encoded.drop('price', axis=1)
y_label = data_label_encoded['price']

In [None]:
# Preview the encoded feature matrix (categorical columns are numeric codes now)
X_label.head()

In [None]:
# (rows, number of candidate features)
X_label.shape

## Technique 1 - Correlation Analysis

In [None]:
 # Heatmap of the pairwise correlations between all encoded columns, price included
 sns.heatmap(data=data_label_encoded.corr())

In [None]:
# Correlation of every feature with the target.
# The target's self-correlation is removed by label: a positional slice (.iloc[1:])
# drops whichever column happens to be first, which is only correct if 'price' is
# the first column of the frame.
fi_df1 = (
    data_label_encoded.corr()['price']
    .drop('price')
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'feature', 'price': 'corr_coeff'})
)
fi_df1

## Technique 2 - Random Forest Feature Importance

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a seeded random forest on the full encoded data set
rf_label = RandomForestRegressor(random_state=42, n_estimators=100)
rf_label.fit(X_label, y_label)

# One row per feature with the forest's importance score, largest first
fi_df2 = pd.DataFrame(
    {'feature': X_label.columns, 'rf_importance': rf_label.feature_importances_}
)
fi_df2 = fi_df2.sort_values(by='rf_importance', ascending=False)

fi_df2

## Technique 3 - Gradient Boosting Feature importances


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Train a gradient boosting regressor on the ordinal-encoded data.
# Seeded so the importances are reproducible across runs.
gb_label = GradientBoostingRegressor(random_state=42)
gb_label.fit(X_label, y_label)

# One row per feature with the model's importance score, largest first
fi_df3 = pd.DataFrame({
    'feature': X_label.columns,
    'gb_importance': gb_label.feature_importances_
}).sort_values(by='gb_importance', ascending=False)

fi_df3

## Technique 4 - Permutation Importance

You shuffle the values of each column and redo the prediction: if the prediction quality goes down, the column is more important; if it remains unchanged, the column is less important.

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; importance is scored on this held-out part
X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(
    X_label, y_label, random_state=42, test_size=0.2
)

# Note: this rebinds rf_label to a forest fitted on the training split only
rf_label = RandomForestRegressor(random_state=42, n_estimators=100)
rf_label.fit(X_train_label, y_train_label)

# Shuffle each feature 30 times on the test split and record the score change
perm_importance = permutation_importance(
    rf_label, X_test_label, y_test_label, random_state=42, n_repeats=30
)

# One row per feature with the mean importance over the repeats, largest first
fi_df4 = pd.DataFrame(
    {'feature': X_label.columns, 'permutation_importance': perm_importance.importances_mean}
)
fi_df4 = fi_df4.sort_values(by='permutation_importance', ascending=False)

fi_df4

## Technique 5 - LASSO

We will use a linear model just to show that its results are not that reliable after ordinal encoding.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Standardise the features so the coefficient sizes are comparable
scaler = StandardScaler()
X_scaled = scaler.fit(X_label).transform(X_label)

# LASSO with a small regularisation strength (alpha=0.01), for demonstration
lasso = Lasso(random_state=42, alpha=0.01)
lasso.fit(X_scaled, y_label)

# One row per feature with its signed coefficient, largest first
fi_df5 = pd.DataFrame(
    {'feature': X_label.columns, 'lasso_coeff': lasso.coef_}
)
fi_df5 = fi_df5.sort_values(by='lasso_coeff', ascending=False)

fi_df5

## Technique 6 - RFE

In [None]:
from sklearn.feature_selection import RFE

# Base estimator; seeded so the importances are reproducible across runs
estimator = RandomForestRegressor(random_state=42)

# Run RFE on the ordinal-encoded (unscaled) data.
# n_features_to_select equals the total number of features, so no feature is
# eliminated: every feature stays selected and gets an importance score.
selector_label = RFE(estimator, n_features_to_select=X_label.shape[1], step=1)
selector_label = selector_label.fit(X_label, y_label)

# Features kept by RFE (all of them, given the setting above)
selected_features = X_label.columns[selector_label.support_]

# Importances of the random forest that RFE fitted on the kept features
selected_coefficients = selector_label.estimator_.feature_importances_

# One row per kept feature with its importance, largest first
fi_df6 = pd.DataFrame({
    'feature': selected_features,
    'rfe_score': selected_coefficients
}).sort_values(by='rfe_score', ascending=False)

fi_df6

## Technique 7 - Linear Regression Weights

In [None]:
# Ordinary least squares on the standardised features (X_scaled comes from the LASSO cell)
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_scaled, y_label)

# One row per feature with its signed regression weight, largest first
fi_df7 = pd.DataFrame(
    {'feature': X_label.columns, 'reg_coeffs': lin_reg.coef_}
)
fi_df7 = fi_df7.sort_values(by='reg_coeffs', ascending=False)

fi_df7

## Technique 8 - SHAP

In [None]:
import shap

# Fit a fresh seeded random forest on the full data set; this is the model being explained
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_label, y_label)

# SHAP values from the tree explainer for every row of X_label
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X_label)

# Mean absolute SHAP value along axis 0, giving one score per feature
# (assumes rows of shap_values are samples -- confirm)
shap_sum = np.mean(np.abs(shap_values), axis=0)

shap_values

In [None]:
# One row per feature with its mean absolute SHAP value (shap_sum from the previous cell), largest first
fi_df8 = pd.DataFrame(
    {'feature': X_label.columns, 'SHAP_score': shap_sum}
)
fi_df8 = fi_df8.sort_values(by='SHAP_score', ascending=False)

fi_df8

In [None]:
# Join the eight per-technique tables on the feature name (default inner join,
# so only features present in every table survive) and index by feature
final_fi_df = (
    fi_df1
    .merge(fi_df2, on='feature')
    .merge(fi_df3, on='feature')
    .merge(fi_df4, on='feature')
    .merge(fi_df5, on='feature')
    .merge(fi_df6, on='feature')
    .merge(fi_df7, on='feature')
    .merge(fi_df8, on='feature')
    .set_index('feature')
)


In [None]:
# Combined table: one row per feature, one column per technique
final_fi_df

In [None]:
# normalize the score
final_fi_df = final_fi_df.divide(final_fi_df.sum(axis=0), axis=1)

In [None]:
# Average the normalised scores of five techniques (the correlation, LASSO and
# linear-regression columns are left out), most important feature first
final_fi_df.loc[
    :, ['rf_importance', 'gb_importance', 'permutation_importance', 'rfe_score', 'SHAP_score']
].mean(axis=1).sort_values(ascending=False)


* pooja room, study room and others have very low importance, so we drop them; before dropping, verify whether the prediction score changes.

In [None]:
# Baseline: 5-fold cross-validated R^2 using every column of X_label
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(random_state=42, n_estimators=100)

scores = cross_val_score(rf, X_label, y_label, scoring='r2', cv=5)

In [None]:
# Mean R^2 across the five folds with all features
scores.mean()

In [None]:
# Same seeded model and 5-fold R^2, but without the three low-importance columns
rf = RandomForestRegressor(random_state=42, n_estimators=100)

scores = cross_val_score(
    rf,
    X_label.drop(['pooja room', 'study room', 'others'], axis=1),
    y_label,
    scoring='r2',
    cv=5,
)
scores.mean()

In [None]:
# Encoded features minus the three dropped columns, with the target appended as the last column
export_df = X_label.drop(['pooja room', 'study room', 'others'], axis=1).assign(price=y_label)

In [None]:
# Final check of the frame that is about to be written to disk
export_df

In [None]:
# Persist the encoded, feature-selected data; index=False keeps the row index out of the file
export_df.to_csv('../Dataset/gurgaon_properties_post_feature_selection.csv', index=False)