In [518]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [None]:
pd.set_option('display.max_row',None)
pd.set_option('display.max_columns',None)

## Load Flat Dataset

In [None]:
df_flat = pd.read_excel('flats.xlsx')
df_flat.head()

In [None]:
df_flat.shape

In [None]:
df_flat.info()

1. all most every column have  missing value
2. All columns have string data type

In [None]:
df_flat.duplicated().sum()

=> No duplicate entries found

In [None]:
df_flat.isnull().sum()

Lot of missing value in my data set

In [None]:
# Deleting unwanted columns
df_flat.drop(columns=['link','property_id'], axis=1, inplace=True)
df_flat.head()

In [None]:
# rename column "area" to "Price per sq feets"

df_flat.rename(columns={'area':'Price_Per_sqFeets'},inplace=True)
df_flat.head()

In [None]:
df_flat['society'].value_counts()

In [None]:
df_flat['society'].value_counts().shape

In [None]:
import re
df_flat['society'] = df_flat['society'].apply(lambda x: re.sub(r'\d+(\.\d+)?\s★', '', str(x)).strip()).str.lower()

In [None]:
df_flat['society'].value_counts().shape

In [None]:
df_flat['price'].value_counts()

In [None]:
df_flat= df_flat[df_flat['price'] != 'Price on Request']

In [None]:
df_flat.head()

In [None]:
df_flat['price'].value_counts()

In [None]:
# create a function for remove string from price column

def price_update(x):
    if type(x) == float:
        return x
    else:
        try:
            if x[1] == 'Lac':
                return round(float(x[0])/100,2)
            else:
                return round(float(x[0]),2)
        except:
            pass

In [None]:
df_flat['price'] = df_flat['price'].str.split(' ').apply(price_update)

In [None]:
df_flat.head()

In [None]:
df_flat['Price_Per_sqFeets']= df_flat['Price_Per_sqFeets'].str.split(' ').str.get(1).str.split('/').str.get(0).str.replace(',','').str.strip().astype(float)

In [None]:
df_flat.head()

In [None]:
df_flat['areaWithType'].value_counts()

In [None]:
#bedroom
df_flat['bedRoom'].value_counts()

In [None]:
df_flat[df_flat['bedRoom'].isnull()]

In [None]:
df_flat = df_flat[~df_flat['bedRoom'].isnull()]

In [None]:
df_flat

In [None]:
df_flat['bedRoom']= df_flat['bedRoom'].str.split(' ').str.get(0).str.strip().astype('int')

In [None]:
df_flat.head()

In [None]:
df_flat['bathroom'].value_counts()

In [None]:
df_flat['bathroom'].isnull().sum()

In [None]:
df_flat['bathroom']=df_flat['bathroom'].str.split(' ').str.get(0).astype('int')

In [None]:
df_flat['balcony'].value_counts()

In [None]:
df_flat['balcony'].isnull().sum()

In [None]:
df_flat['balcony'] = df_flat['balcony'].str.split(' ').str.get(0).str.replace('No','0')

In [None]:
df_flat.head()

In [None]:
df_flat['additionalRoom'].value_counts()

In [None]:
df_flat['additionalRoom'].isnull().sum()

In [None]:
df_flat['additionalRoom'].fillna('not available', inplace=True)

In [None]:
df_flat['additionalRoom'] = df_flat['additionalRoom'].str.lower()

In [None]:
df_flat.head()

In [None]:
# floor nu

df_flat['floorNum']

In [None]:
df_flat['floorNum'].isnull().sum()

In [None]:
df_flat['floorNum'] = df_flat['floorNum'].str.split(' ').str.get(0).str.replace('Ground','0').str.replace('Basement','-1').str.replace('Lower','0').str.extract(r'(\d+)')

In [None]:
df_flat['floorNum'] = df_flat['floorNum'].astype('float')

In [None]:
df_flat.head()

In [None]:
df_flat['facing'].value_counts()

In [None]:
df_flat['facing'].isnull().sum()

In [None]:
df_flat['facing'].fillna('NA',inplace=True)

In [None]:
df_flat.insert(loc=4,column='area', value=round((df_flat['price']*10000000)/df_flat['Price_Per_sqFeets']))

In [None]:
df_flat.insert(loc=1,column='Property_type',value='flat')

In [None]:
df_flat.head()

In [None]:
df_flat.info()

In [None]:
df_flat.shape

## Load House data set

In [None]:
# load dataset

df_house = pd.read_excel('independent-house.xlsx')
df_house.head()

In [None]:
df_house.drop(columns=['link','property_id'], axis=1, inplace=True)
df_house.head()

In [None]:
df_house.isnull().sum()

In [None]:
df_house.duplicated().sum()

In [None]:
df_house.drop_duplicates(inplace=True)

In [None]:
df_house.shape

In [None]:
df_house.info()

In [None]:
df_house.rename(columns={'rate':'Price_Per_sqFeets'},inplace=True)

In [None]:
df_house['society'].value_counts()

In [None]:
df_house['society'].value_counts().sum()

In [None]:
import re
df_house['society'] = df_house['society'].apply(lambda x: re.sub(r'\d+(\.\d+)?\s★', '', str(x)).strip()).str.lower()

In [None]:
df_house['society'].value_counts()

In [None]:
df_house['society'] = df_house['society'].str.replace('nan','independent')

In [None]:
def correct_price(x):
    if type(x) == float:
        return x
    else:
        if x[1] == 'Lac':
            return round(float(x[0])/100,2)
        else:
            return round(float(x[0]),2)

In [None]:
df_house['price'].value_counts()

In [None]:
df_house = df_house[df_house['price'] != 'Price on Request']

In [None]:
df_house.shape

In [None]:
df_house['price'] = df_house['price'].str.split().apply(correct_price)

In [None]:
df_house.info()

In [None]:
df_house['Price_Per_sqFeets'].value_counts()

In [None]:
df_house['Price_Per_sqFeets'].value_counts().sum()

In [None]:
df_house['Price_Per_sqFeets'] = df_house['Price_Per_sqFeets'].str.split(' ').str.get(1).str.split('/').str.get(0).str.replace(',','')

In [None]:
df_house['Price_Per_sqFeets'] = df_house['Price_Per_sqFeets'].astype('float')

In [None]:
df_house.info()

In [None]:
df_house.head()

In [None]:
df_house['area'].value_counts()

In [None]:
df_house['area'].value_counts().sum()

In [None]:
df_house['bedRoom'].value_counts()

In [None]:
df_house['bedRoom'].isnull().sum()

In [None]:
df_house = df_house[~df_house['bedRoom'].isnull()]

In [None]:
df_house.head()

In [None]:
df_house['bedRoom'] = df_house['bedRoom'].str.split(' ').str.get(0).astype('int')

In [None]:
df_house['bathroom'].isnull().sum()

In [None]:
df_house['bathroom'] = df_house['bathroom'].str.split(' ').str.get(0).astype('int')

In [None]:
df_house['balcony'].isnull().sum()

In [None]:
df_house['balcony'].value_counts()

In [None]:
df_house['balcony'] = df_house['balcony'].str.split(' ').str.get(0).str.replace('No','0')

In [None]:
df_house.sample(10)

In [None]:
df_house['additionalRoom'].value_counts()

In [None]:
df_house['additionalRoom'].isnull().sum()

In [None]:
df_house['additionalRoom'].fillna('Not available',inplace=True)

In [None]:
df_house['noOfFloor'].value_counts()

In [None]:
df_house['noOfFloor'].isnull().sum()

In [None]:
df_house['noOfFloor'] = df_house['noOfFloor'].str.split(' ').str.get(0)

In [None]:
df_house.rename(columns={'noOfFloor':'floorNum'},inplace=True)

In [None]:
df_house['floorNum'] = df_house['floorNum'].astype('float')

In [None]:
df_house['facing'].isnull().sum()

In [None]:
df_house['facing'].fillna('NA',inplace=True)

In [None]:
df_house['area'] = round((df_house['price']*10000000)/df_house['Price_Per_sqFeets'])

In [None]:
df_house.head()

In [None]:
df_house.insert(loc=1,column='Property_type', value='house')

In [None]:
df_house.shape

In [None]:
df_house.info()

### Concatenate House dataset and Flat dataset

In [None]:
df = pd.concat([df_flat,df_house],axis=0).reset_index(drop=True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df['property_name'].value_counts()

In [None]:
df.insert(loc=3,column='sector',value=df['property_name'].str.split('in').str.get(1).str.replace('Gurgaon',''))

In [None]:
df.head()

In [None]:
df['sector'] = df['sector'].str.lower()

In [None]:
df.head()

In [None]:
df.duplicated().sum()

In [None]:
df['sector'].value_counts()

In [None]:
df['sector'].value_counts().sum()

In [None]:
df['sector'] = df['sector'].str.replace('sohna','sector 35')

In [None]:
df['sector'] = df['sector'].str.replace('nirvana country','sector 50')
df['sector'] = df['sector'].str.replace('sector-33 sector 35','sector 35')
df['sector'] = df['sector'].str.replace('palam vihar','sector 23')
df['sector'] = df['sector'].str.replace('dlf phase 1','sector 26')
df['sector'] = df['sector'].str.replace('dlf phase 2','sector 25')
df['sector'] = df['sector'].str.replace('sushant lok phase 1','sector 43')

In [None]:
df['sector'] = df['sector'].str.replace('laxman vihar','sector 43')

In [None]:
import re

def extract_sector(x):
    
    match = re.search(r'sector\s\d+', x)
    
    if match:
        extracted_text = match.group()
        return extracted_text
    else:
        return x

In [None]:
df['sector'] = df['sector'].apply(extract_sector)

In [None]:
df['sector'] = df['sector'].str.replace('sector 105, sector-105', 'sector 105')


In [None]:
df['sector'] = df['sector'].str.replace('block g sector-57', 'sector 57')
df['sector'] = df['sector'].str.replace('sector-7 hous', 'sector 7')
df['sector'] = df['sector'].str.replace('sector-5 sheetla colony', 'sector 5')

In [None]:
df['sector'] = df['sector'].str.replace('laxman vihar','Sector 4')
df['sector'] = df['sector'].str.replace('dlf phase 4','Sector 27')
df['sector'] = df['sector'].str.replace('dlf phase 3','Sector 24')
df['sector'] = df['sector'].str.replace('sushant lok phase 3','Sector 57')
df['sector'] = df['sector'].str.replace('rajendra park','Sector 105')
df['sector'] = df['sector'].str.replace('gwal pahari','Sector 79')
df['sector'] = df['sector'].str.replace('dlf phase 5','Sector 53')
df['sector'] = df['sector'].str.replace('laxman vihar phase 2','Sector 4')
df['sector'] = df['sector'].str.replace('uppals southend','Sector 49')
df['sector'] = df['sector'].str.replace('ashok vihar phase 3 extension','Sector 35')
df['sector'] = df['sector'].str.replace('ashok vihar phase 2','Sector 37')
df['sector'] = df['sector'].str.replace('south city 1','Sector 41')
df['sector'] = df['sector'].str.replace('mehrauli  road','Sector 14')
df['sector'] = df['sector'].str.replace('dwarka expressway','Sector 104')
df['sector'] = df['sector'].str.replace('malibu town','Sector 47')
df['sector'] = df['sector'].str.replace('surat nagar 1','Sector 104')
df['sector'] = df['sector'].str.replace('new colony','Sector 7')
df['sector'] = df['sector'].str.replace('mianwali colony','Sector 12A')
df['sector'] = df['sector'].str.replace('rajiv nagar','Sector 13')
df['sector'] = df['sector'].str.replace('ashok vihar','Sector 5')
df['sector'] = df['sector'].str.replace('jyoti park','Sector 7')
df['sector'] = df['sector'].str.replace('jacobpura','Sector 12A')
df['sector'] = df['sector'].str.replace('greenwood city','Sector 45')
df['sector'] = df['sector'].str.replace('subhash nagar','Sector 10')
df['sector'] = df['sector'].str.replace('dayanand colony','Sector 7')
df['sector'] = df['sector'].str.replace('manesar','Sector 1')
df['sector'] = df['sector'].str.replace('saraswati vihar','Sector 10')
df['sector'] = df['sector'].str.replace('arjun nagar','Sector 52')
df['sector'] = df['sector'].str.replace('sushant lok phase 2','Sector 56')
df['sector'] = df['sector'].str.replace('ansal plaza','Sector 23')
df['sector'] = df['sector'].str.replace('chakkarpur','Sector 28')
df['sector'] = df['sector'].str.replace('krishna colony','Sector 7')
df['sector'] = df['sector'].str.replace('vishnu garden','Sector 105')
df['sector'] = df['sector'].str.replace('madanpuri','Sector 7')
df['sector'] = df['sector'].str.replace('shivaji nagar','Sector 9')
df['sector'] = df['sector'].str.replace('dharam colony','Sector 9')
df['sector'] = df['sector'].str.replace('valley view estate','Sector 46')
df['sector'] = df['sector'].str.replace('devilal colony','Sector 9')
df['sector'] = df['sector'].str.replace('adarsh nagar','Sector 12')
df['sector'] = df['sector'].str.replace('shivpuri','Sector 9')
df['sector'] = df['sector'].str.replace('surya vihar','Sector 21')
df['sector'] = df['sector'].str.replace('new','Unknown')
df['sector'] = df['sector'].str.replace('imt manesar','IMT Manesar')
df['sector'] = df['sector'].str.replace('bhim nagar','Sector 6')
df['sector'] = df['sector'].str.replace('suncity','Sector 54')
df['sector'] = df['sector'].str.replace('ravi nagar','Sector 23')
df['sector'] = df['sector'].str.replace('baldev nagar','Sector 9')
df['sector'] = df['sector'].str.replace('garhi harsaru','Garhi Harsaru')
df['sector'] = df['sector'].str.replace('bhondsi','Bhondsi')
df['sector'] = df['sector'].str.replace('laxmi garden','Sector 9')
df['sector'] = df['sector'].str.replace('prem nagar','Sector 13')
df['sector'] = df['sector'].str.replace('gandhi nagar','Sector 9')
df['sector'] = df['sector'].str.replace('shakti nagar','Sector 10')
df['sector'] = df['sector'].str.replace('maruti kunj','Sector 67')
df['sector'] = df['sector'].str.replace('mg road','Sector 28')
df['sector'] = df['sector'].str.replace('patel nagar','Sector 15')
df['sector'] = df['sector'].str.replace('iffco chowk','Sector 29')
df['sector'] = df['sector'].str.replace('acharya puri','Sector 12')
df['sector'] = df['sector'].str.replace('surat nagar','Sector 104')
df['sector'] = df['sector'].str.replace('ambience island','Ambience Island')
df['sector'] = df['sector'].str.replace('west rajiv nagar','Sector 6')
df['sector'] = df['sector'].str.replace('sheetla colony','Sector 5')
df['sector'] = df['sector'].str.replace('farukhnagar','Farukhnagar')
df['sector'] = df['sector'].str.replace('golf course road','Golf Course Road')
df['sector'] = df['sector'].str.replace('bhawani enclave','Sector 9')
df['sector'] = df['sector'].str.replace('surat nagar phase 2','Sector 104')
df['sector'] = df['sector'].str.replace('pratap nagar','Sector 8')
df['sector'] = df['sector'].str.replace('manohar nagar','Sector 7')
df['sector'] = df['sector'].str.replace('rajiv chowk','Sector 33')
df['sector'] = df['sector'].str.replace('block a surya vihar','Sector 21')
df['sector'] = df['sector'].str.replace('mayfield garden','Mayfield Garden')
df['sector'] = df['sector'].str.replace('rosewood','Sector 49')
df['sector'] = df['sector'].str.replace('lajpat nagar','Sector 15')
df['sector'] = df['sector'].str.replace('bissar','Sector 3')
df['sector'] = df['sector'].str.replace('mohan nagar','Sector 13')
df['sector'] = df['sector'].str.replace('samaspur','Sector 51')
df['sector'] = df['sector'].str.replace('nars','Unknown')
df['sector'] = df['sector'].str.replace('daulatabad','Daulatabad')
df['sector'] = df['sector'].str.replace('hira nagar','Sector 15')
df['sector'] = df['sector'].str.replace('sushant lok','Sushant Lok')
df['sector'] = df['sector'].str.replace('new basti','Unknown')
df['sector'] = df['sector'].str.replace('shanti nagar','Sector 10')
df['sector'] = df['sector'].str.replace('sispal vihar','Sector 49')
df['sector'] = df['sector'].str.replace('golf course ext road','Golf Course Extension Road')
df['sector'] = df['sector'].str.replace('shankar vihar','Sector 4')
df['sector'] = df['sector'].str.replace('faridabad road','Faridabad Road')
df['sector'] = df['sector'].str.replace('damdma','Damdama')
df['sector'] = df['sector'].str.replace('laxman vihar, railway road,','Sector 3A')
df['sector'] = df['sector'].str.replace('new jyoti park','Sector 9')
df['sector'] = df['sector'].str.replace('dhankot','Dhankot')
df['sector'] = df['sector'].str.replace('mahavir pura','Sector 4')
df['sector'] = df['sector'].str.replace('g block dlf city phase 1','Sector 28')
df['sector'] = df['sector'].str.replace('naharpur rupa','Sector 111')
df['sector'] = df['sector'].str.replace('park view','Sector 3')
df['sector'] = df['sector'].str.replace('ravi nager basai road','Sector 9')
df['sector'] = df['sector'].str.replace('begampur khatola','Begampur Khatola')
df['sector'] = df['sector'].str.replace('sadar bazar','Sector 9')
df['sector'] = df['sector'].str.replace('near nand hospital','Unknown')
df['sector'] = df['sector'].str.replace('devi lal colony gali no 12 gurgaon nr omkar public school','Sector 9')
df['sector'] = df['sector'].str.replace('alipur','Alipur')
df['sector'] = df['sector'].str.replace('mahalaxmi garden, rajendra place','Unknown')
df['sector'] = df['sector'].str.replace('saraswati enclave','Sector 28')
df['sector'] = df['sector'].str.replace('sec 3 bhim vihar part 1 bhimgher kheri','Sector 5')
df['sector'] = df['sector'].str.replace('near euro','Unknown')
df['sector'] = df['sector'].str.replace('bhimgarh kheri phase 3','Unknown')
df['sector'] = df['sector'].str.replace('huda saraswati vihar','Sector 10')
df['sector'] = df['sector'].str.replace('laxmna vihar, phase 2','Sector 37')
df['sector'] = df['sector'].str.replace('rajiv colony','Sector 9')
df['sector'] = df['sector'].str.replace('south city','Sector 41')
df['sector'] = df['sector'].str.replace('v block dlf phase 3','Sector 24')
df['sector'] = df['sector'].str.replace('rattan garden','Unknown')
df['sector'] = df['sector'].str.replace('ashok vihar phase 3','Sector 35')
df['sector'] = df['sector'].str.replace('udyog vihar phase 1','Udyog Vihar Phase 1')
df['sector'] = df['sector'].str.replace('shivji park colony','Unknown')
df['sector'] = df['sector'].str.replace('near pataudi chowk','Unknown')
df['sector'] = df['sector'].str.replace('bhora kalan','Bhora Kalan')
df['sector'] = df['sector'].str.replace('ats marigold','ATS Marigold')
df['sector'] = df['sector'].str.replace('civil l','Sector 2')
df['sector'] = df['sector'].str.replace('ashok vihar phase i','Sector 3A')
df['sector'] = df['sector'].str.replace('block h ashok vihar phase iii extension','Sector 5')
df['sector'] = df['sector'].str.replace('sultanpur','Sultanpur')
df['sector'] = df['sector'].str.replace('carterpuri village','Carterpuri')
df['sector'] = df['sector'].str.replace('sai kunj','Sai Kunj')
df['sector'] = df['sector'].str.replace('khandsa','Khandsa')
df['sector'] = df['sector'].str.replace('heritage city','Heritage City')
df['sector'] = df['sector'].str.replace('near khandsa road','Unknown')
df['sector'] = df['sector'].str.replace('kanahi','Kanahi')
df['sector'] = df['sector'].str.replace('lucknow','Lucknow')
df['sector'] = df['sector'].str.replace('pataudi road','Pataudi Road')
df['sector'] = df['sector'].str.replace('saroop garden rajendra park','Sector 105')
df['sector'] = df['sector'].str.replace('kheri','Kheri')
df['sector'] = df['sector'].str.replace('jat colony,fazilpur/taj nagar road','Unknown')
df['sector'] = df['sector'].str.replace('gopal nagar, garhi ghasita, sonipat','Unknown')
df['sector'] = df['sector'].str.replace('vir nagar','Vir Nagar')
df['sector'] = df['sector'].str.replace('hno 13 sagar enclave sura','Unknown')
df['sector'] = df['sector'].str.replace('ram nagar','Ram Nagar')
df['sector'] = df['sector'].str.replace('om nagar','Sector 7')
df['sector'] = df['sector'].str.replace('garauli kalan','Garauli Kalan')
df['sector'] = df['sector'].str.replace('tekchand nagar','Tekchand Nagar')
df['sector'] = df['sector'].str.replace('cyber city','Cyber City')
df['sector'] = df['sector'].str.replace('jacobpura near sadar bazar','Sector 12A')
df['sector'] = df['sector'].str.replace('bptp','BPTP')


In [None]:
df['sector'].value_counts()

In [None]:
df = df[df['sector'].str.contains(r'\bsector\b', case=False, na=False)]

In [None]:
def clean_sector(x):
    match = re.search(r'sector\s+(\d+)', x)
    
    if match:
        extract_text = match.group(0)
        return extract_text
    else:
        return x

In [None]:
df['sector'] = df['sector'].str.strip().str.lower().apply(clean_sector)
df['sector'] = df['sector'].str.replace('old delhi road, opp sector-14', 'sector 14')
df['sector'] = df['sector'].str.replace('anand garden, sector-105', 'sector 105')
df['sector'] = df['sector'].str.replace('sector 24', 'sector 23')
df['sector'] = df['sector'].str.replace('sector 42', 'sector 43')

In [None]:
df['sector'].unique()

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.drop(columns=['property_name','address','description','rating'], inplace=True)

In [None]:
df.head()

In [None]:
# feature engineering required => areaWithType, facing, agePassession, furnishDetails, features

### Feature Engineering

In [None]:
df.duplicated().sum()

In [None]:
df.shape

In [None]:
df.head(2)

Focus on => areaWithType, additionalRoom, agePossession, furnishDetails, features 

## 1. areaWithType

In [None]:
df[['area','areaWithType']].sample(5)

In [None]:
# this function extract super buldup area
def super_buildup_area(text):
    match = re.search(r'Super Built up area (\d+\.?\d*)',text)
    if match:
        return float(match.group(1))
    return None

In [None]:
# this fuction extract buildup area or corpet area
def get_area(text,area_type):
    match = re.search(area_type + r'\s*:\s*(\d+\.?\d*)',text)
    if match:
        return float(match.group(1))
    return None

In [None]:
# this function check if the area is provided in sq m and converts into sq ft if needed
def convert_to_sqft(text, area_value):
    if area_value is None:
        return None
    match = re.search(r'{} \((\d+\.?\d*) sq.m.\)'.format(area_value), text)
    if match:
        sq_m_value = float(match.group(1))
        return sq_m_value * 10.7639
    return area_value

In [None]:
# Extract super built up area and conver sqft if needed 
df['super_built_up_area'] = df['areaWithType'].apply(super_buildup_area)
df['super_built_up_area'] = df.apply(lambda x: convert_to_sqft(x['areaWithType'], x['super_built_up_area']), axis=1)

# Extract built up area and convert to sqft if needed
df['built_up_area'] = df['areaWithType'].apply(lambda x:get_area(x, 'Built Up area'))
df['built_up_area'] = df.apply(lambda x: convert_to_sqft(x['areaWithType'], x['built_up_area']), axis=1)

# Extract carpet area and convert to sqft if needed
df['carpet_area'] = df['areaWithType'].apply(lambda x:get_area(x,'Carpet area'))
df['carpet_area'] = df.apply(lambda x:convert_to_sqft(x['areaWithType'],x['carpet_area']), axis=1)

In [None]:
df[['price','Property_type','area','areaWithType','super_built_up_area','built_up_area','carpet_area']].sample(5)

In [None]:
df[~((df['super_built_up_area'].isnull()) | (df['built_up_area'].isnull()) |(df['carpet_area'].isnull()))][['price','Property_type','area','areaWithType','super_built_up_area','built_up_area','carpet_area']]

In [None]:
df[df['areaWithType'].str.contains('Plot')][['price','Property_type','area','areaWithType','super_built_up_area','built_up_area','carpet_area']].shape

In [None]:
df.isnull().sum()

In [None]:
all_nan_df = df[((df['super_built_up_area'].isnull()) & (df['built_up_area'].isnull()) & (df['carpet_area'].isnull()))][['price','Property_type','area','areaWithType','super_built_up_area','built_up_area','carpet_area']]

In [None]:
all_nan_index = df[((df['super_built_up_area'].isnull()) & (df['built_up_area'].isnull()) & (df['carpet_area'].isnull()))][['price','Property_type','area','areaWithType','super_built_up_area','built_up_area','carpet_area']].index

In [None]:
# Function to extract plot area from 'areaWithType' column

def extract_plot_area(area_with_type):
    match = re.search(r'Plot area (\d+\.?\d*)', area_with_type)
    return float(match.group(1)) if match else None

In [None]:
all_nan_df['built_up_area'] = all_nan_df['areaWithType'].apply(extract_plot_area)

In [None]:
all_nan_df

In [None]:
def convert_scale(row):
    if np.isnan(row['area']) or np.isnan(row['built_up_area']):
        return row['built_up_area']
    else:
        if round(row['area']/row['built_up_area']) ==9.0:
            return row['built_up_area'] * 9
        elif round(row['area']/row['built_up_area']) == 11:
            return row['built_up_area'] *10.7
        else:
            return row['built_up_area']

In [None]:
all_nan_df['built_up_area'] = all_nan_df.apply(convert_scale,axis=1)

In [None]:
all_nan_df

In [None]:
# update original dataframe
df.update(all_nan_df)

In [None]:
df.isnull().sum()

## 2. additionalRoom

In [None]:
df['additionalRoom'].value_counts()

In [None]:
# additional room
new_cols = ['servant room','study room','others','pooja room','store room']

# populate the new columns based on the 'additionalRoom' column

for col in new_cols:
    df[col] = df['additionalRoom'].str.contains(col).astype(int)

In [None]:
df.sample(5)[['additionalRoom','servant room','study room','others','pooja room','store room']]

## 3. agePossession

In [None]:
df['agePossession'].value_counts()

In [None]:
def categorize_age_possession(value):
    if pd.isna(value):
        return 'Undefined'
    value = str(value)
    if '0 to 1 Year Old' in value or "Within 6 months" in value or 'Within 3 months' in value:
        return "New Property"
    if '1 to 5 Year Old' in value:
        return 'Relatively New'
    if '5 to 10 Year Old' in value:
        return 'Moderately Old'
    if '10+ Year Old' in value:
        return 'Old Property'
    if 'Under Construction' in value or 'By' in value:
        return 'Under Construction'
    try:
        # for entry like 'May 2024'
        int(value.split(' ')[-1])
        return 'Under Construction'
    except:
        return 'Undefined'

In [None]:
df['agePossession'] = df['agePossession'].apply(categorize_age_possession)

In [None]:
df['agePossession'].value_counts()

## 4. furnishDetails

In [None]:
df.sample(5)[['furnishDetails','features']]

In [None]:
# Extract all unique furnishings from the furnishDetails column
all_furnishings = []
for details in df['furnishDetails'].dropna():
    furnishings = details.replace('[','').replace(']','').replace("'","").split(',')
    all_furnishings.extend(furnishings)
unique_furnishings = list(set(all_furnishings))

In [None]:
unique_furnishings

In [None]:
# Define a function to extract the count of a furnishing from the furnishingDetails
def get_furnishing_count(details, furnishing):
    if isinstance(details, str):
        if f"No {furnishing}" in details:
            return 0
        pattern = re.compile(f"(\d+) {furnishing}")
        match = pattern.search(details)
        if match:
            return int(match.group(1))
        elif furnishing in details:
            return 1
    return 0

# Simplify the furnishing list by removing "No" prefix and numbers
columns_to_include = [re.sub(r'No |\d+','',furnishing).strip() for furnishing in unique_furnishings]
columns_to_include = list(set(columns_to_include)) # get unique furnishings
columns_to_include = [furnishing for furnishing in columns_to_include if furnishing] # remove empaty string

# create new columns for each unique furnishing and papulate with counts
for furnishing in columns_to_include:
    df[furnishing] = df['furnishDetails'].apply(lambda x : get_furnishing_count(x, furnishing))
    
# create the new dataframe with the required columns
furnishing_df = df[['furnishDetails'] + columns_to_include]

In [None]:
furnishing_df.shape

In [None]:
furnishing_df.drop(columns=['furnishDetails'], inplace=True)

In [None]:
furnishing_df.sample(5)

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(furnishing_df)

In [None]:
wcss_reduced = []

for i in range(1,11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(scaled_data)
    wcss_reduced.append(kmeans.inertia_)

In [None]:
# plot the result

plt.figure(figsize=(12,8))
plt.plot(range(1,11),wcss_reduced,marker='o', linestyle = '-')
plt.title('Find number of cluster')
plt.xlabel('Number of Cluster')
plt.ylabel('WCSS')
plt.grid(True)
plt.show()

In [None]:
n_clusters = 3

# fit the KMeans model

kmeans = KMeans(n_clusters=n_clusters,random_state=42)
kmeans.fit(scaled_data)

# predict the cluster assignments for each row

cluster_assignments = kmeans.predict(scaled_data)

In [None]:
df =df.iloc[:,:-18]

In [None]:
df['furnishing_type'] = cluster_assignments

In [None]:
df.sample(5)[['furnishDetails','furnishing_type']]
#0 => unfurnished
#1 => semi-furnished
#2 => furnished

## 5. feature

In [None]:
df[['society','features']].sample(5)

In [None]:
df['features'].isnull().sum()

In [None]:
app_df = pd.read_excel('real_estate_data.xlsx')
app_df.head(2)

In [None]:
app_df['PropertyName'] = app_df['PropertyName'].str.lower()

In [None]:
temp_df = df[df['features'].isnull()]

In [None]:
temp_df.shape

In [None]:
temp_df

In [None]:
x = temp_df.merge(app_df,left_on='society',right_on='PropertyName',how='left')['TopFacilities']

In [None]:
x.values

In [None]:
df.loc[temp_df.index].shape

In [None]:
df.loc[temp_df.index,'features'] = x.values

In [None]:
x.shape

In [None]:
df['features'].isnull().sum()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
import ast

In [None]:
# convert the string representation of lists in the 'features' column to actual lists

df['features_list'] = df['features'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) and x.startswith('[') else [])

# Use multilabelBinarizer to convert the features list into a binary metrix
mlb = MultiLabelBinarizer()
features_binary_metrix = mlb.fit_transform(df['features_list'])

# convert the binary metrix into Dataframe
features_binary_df = pd.DataFrame(features_binary_metrix,columns=mlb.classes_)

In [None]:
features_binary_df.sample(5)

In [None]:
# Define the weights for each feature as provided
# Assigning weights based on perceived luxury contribution
weights = {
     '24/7 Power Backup': 8,
    '24/7 Water Supply': 4,
    '24x7 Security': 7,
    'ATM': 4,
    'Aerobics Centre': 6,
    'Airy Rooms': 8,
    'Amphitheatre': 7,
    'Badminton Court': 7,
    'Banquet Hall': 8,
    'Bar/Chill-Out Lounge': 9,
    'Barbecue': 7,
    'Basketball Court': 7,
    'Billiards': 7,
    'Bowling Alley': 8,
    'Business Lounge': 9,
    'CCTV Camera Security': 8,
    'Cafeteria': 6,
    'Car Parking': 6,
    'Card Room': 6,
    'Centrally Air Conditioned': 9,
    'Changing Area': 6,
    "Children's Play Area": 7,
    'Cigar Lounge': 9,
    'Clinic': 5,
    'Club House': 9,
    'Concierge Service': 9,
    'Conference room': 8,
    'Creche/Day care': 7,
    'Cricket Pitch': 7,
    'Doctor on Call': 6,
    'Earthquake Resistant': 5,
    'Entrance Lobby': 7,
    'False Ceiling Lighting': 6,
    'Feng Shui / Vaastu Compliant': 5,
    'Fire Fighting Systems': 8,
    'Fitness Centre / GYM': 8,
    'Flower Garden': 7,
    'Food Court': 6,
    'Foosball': 5,
    'Football': 7,
    'Fountain': 7,
    'Gated Community': 7,
    'Golf Course': 10,
    'Grocery Shop': 6,
    'Gymnasium': 8,
    'High Ceiling Height': 8,
    'High Speed Elevators': 8,
    'Infinity Pool': 9,
    'Intercom Facility': 7,
    'Internal Street Lights': 6,
    'Internet/wi-fi connectivity': 7,
    'Jacuzzi': 9,
    'Jogging Track': 7,
    'Landscape Garden': 8,
    'Laundry': 6,
    'Lawn Tennis Court': 8,
    'Library': 8,
    'Lounge': 8,
    'Low Density Society': 7,
    'Maintenance Staff': 6,
    'Manicured Garden': 7,
    'Medical Centre': 5,
    'Milk Booth': 4,
    'Mini Theatre': 9,
    'Multipurpose Court': 7,
    'Multipurpose Hall': 7,
    'Natural Light': 8,
    'Natural Pond': 7,
    'Park': 8,
    'Party Lawn': 8,
    'Piped Gas': 7,
    'Pool Table': 7,
    'Power Back up Lift': 8,
    'Private Garden / Terrace': 9,
    'Property Staff': 7,
    'RO System': 7,
    'Rain Water Harvesting': 7,
    'Reading Lounge': 8,
    'Restaurant': 8,
    'Salon': 8,
    'Sauna': 9,
    'Security / Fire Alarm': 9,
    'Security Personnel': 9,
    'Separate entry for servant room': 8,
    'Sewage Treatment Plant': 6,
    'Shopping Centre': 7,
    'Skating Rink': 7,
    'Solar Lighting': 6,
    'Solar Water Heating': 7,
    'Spa': 9,
    'Spacious Interiors': 9,
    'Squash Court': 8,
    'Steam Room': 9,
    'Sun Deck': 8,
    'Swimming Pool': 8,
    'Temple': 5,
    'Theatre': 9,
    'Toddler Pool': 7,
    'Valet Parking': 9,
    'Video Door Security': 9,
    'Visitor Parking': 7,
    'Water Softener Plant': 7,
    'Water Storage': 7,
    'Water purifier': 7,
    'Yoga/Meditation Area': 7
}
# Calculate luxury score for each row
luxury_score = features_binary_df[list(weights.keys())].multiply(list(weights.values())).sum(axis=1)

In [None]:
df['luxury_score'] = luxury_score

In [None]:
df['luxury_score'].fillna(0, inplace=True)

In [None]:
df.head()

In [None]:
# cols to drap -> nearbylocations, furnishDetails, features, features_list, assitionalRoom
df.drop(columns=['nearbyLocations','furnishDetails','features','features_list','additionalRoom'], inplace=True)

In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
df['luxury_score'].isnull().sum()

## EDA (Exploratory Data Analysis)

### Univeriant Analysis

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.head()

### Property_type

In [None]:
df['Property_type'].value_counts().plot(kind='bar')

#### Observations:
- Flats are in majority(75 percent) and there are less number of houses(~25 percent)
- No missing values

### Society

In [None]:
df['society'].value_counts().shape

In [None]:
df['society'].value_counts()

In [None]:
df[df['society'] != 'independent']['society'].value_counts(normalize=True).cumsum().head(75)

In [None]:
society_count = df['society'].value_counts()

#Frequency distribution for societies

frequency_bins = {
    'Very High (>100)':(society_count >100).sum(),
    'High (50-100)': (society_count>=50).sum(),
    'Average(10-49)': (society_count>=10).sum(),
    'Low (2-9)':(society_count<10).sum(),
    'Very Low (1)':(society_count == 1).sum()
}

In [None]:
frequency_bins

In [None]:
#Top 10 socities
df[df['society'] != 'independent']['society'].value_counts().head(10).plot(kind='bar')

In [None]:
df['society'].isnull().sum()

### Observations:

- Around 13% properties comes under independent tag
- There are 702 societies
- The top 75 societies have 50% of the properties and the rest of percent of the properties comes under the remaining 600 societies
    - Very High(>100): Only 1 society has more than 100 entries
    - High (50-100): 3 societies have 50-100 entries
    - Average(10-49): 98 societies have 10-49 entries
    - Low(2-9): 605 societies have 2 to 9 entries
    - Very Low (1): 308 socities have only 1 entries

### Sector

In [None]:
# unique Sectors
df['sector'].value_counts().shape

In [None]:
# Top 10 Sectors
df['sector'].value_counts().head(10).plot(kind='bar')

In [None]:
# Frequency distribution for sectors

sector_counts = df['sector'].value_counts()

sector_frequency_bins = {
    'Very High (>100)': (sector_counts > 100).sum(),
    'High (50-100)': ((sector_counts >= 50) & (sector_counts <= 100)).sum(),
    'Average (10-49)':((sector_counts >= 10) & (sector_counts < 50)).sum(),
    'Low (2-9)': ((sector_counts >=2) & (sector_counts < 10)).sum(),
    'Very Low (1)': (sector_counts == 1).sum()
}

In [None]:
sector_frequency_bins

### Observations:

- There are total of 98 sectors in the dataset
- Frequency distribution of the sector:
    - Very High (>100): 6 sectors have more than 100 entries
    - High (50-100) : 23 sectors have entries between 50 to 100
    - Average (10-49): 52 sectors have entries between 10 to 49
    - Low (2-9): 16 sectors have entries between 2 to 9
    - Very Low (1): 1 sectors have only one entry

## Price

In [None]:
df['price'].isnull().sum()

In [None]:
df['price'].describe()

In [None]:
sns.histplot(df['price'], kde=True, bins=50)

In [None]:
sns.boxplot(x=df['price'], color='lightgrey')
plt.grid()

### Descriptive Statistics:

- Count There are 3747 non-missing price listing
- Mean Price: The average price approximately 2.53 crores.
- Median Price: The median (or 50th percentile) price is 1.52 crores.
- Standard Deviation:The prices have a standard deviation of 2.98, indicating data variability in the prices.
- Range: Prices range from a minimum of 0.07 crores to a maximum of 31.5 crores.
- IQR: The Interquartile range (difference between 75th and 25th percentile) is from 0.95 crores to 2.75 crores.

### Visualizations:

- Distribution: The histogram indicates the most properties are priced in the lower range (below 5 crores), with a few proparties going beyond 10 crores.
- Box Plot: The box plot showcases the spread of the data and potential outliers. Properties priced above approximately 10 crores might be considered outliers as they lie beyond the upper whisker of the box plot.

#### Missing Values: There are 18 missing values in the price column.

In [None]:
# Skewness and Kurtosis
skewness = df['price'].skew()
kurtosis = df['price'].kurt()

print(skewness, kurtosis)

<b>Skewness</b>:The price distribution has a skewness of approximately 3.31, indicating a positive skew. This means that the distribution tail is skewed to right, which aligns with our observation from the histogram where most properties have prices on the lower end  with a few high-priced properties.

<b>Kurtosis</b>: The kurtosis value is approximately 15.25. A kurtosis value greater than 3 indicates distribution with heavier tails and more outliers compared to a normal distribution.

In [None]:
# Quantile Analysis
quantiles = df['price'].quantile([0.01, 0.05, 0.95, 0.99])

quantiles

### Quantile Analysis:

- 1% Quantile: Only 1% of properties are priced below 0.25 crores.
- 5% Quantile: 5% of properties are priced below 0.37 crores.
- 95% Quantile: 95% of properties are priced below 8.48 crores.
- 99% Quantile: 99% of properties are priced below 15.04 crores, indicating that very few properties are priced above this value.

In [None]:
# Identify potential outliers using IQR method

Q1 = df['price'].describe()['25%']
Q3 = df['price'].describe()['75%']

IQR = Q3-Q1
IQR

In [None]:
lower_bound = Q1 - 1.5*IQR
upper_bound = Q3 + 1.5*IQR

print(lower_bound, upper_bound)

In [None]:
outliers = df[(df['price'] <lower_bound) | (df['price'] > upper_bound)]
outliers.shape

In [None]:
outliers['price'].describe()

### Outliers Analysis (using IQR method):

- Based on the IQR method, there are 440 properties considered as outliers.
- These outliers have an average price of approximately 9.17 crores.
- The range for these outliers is from 6.44 crores to 31.5 crores

In [None]:
# price binning
bins = [0, 1, 2, 3, 5, 10, 20, 50]
bin_labels = ['0-1', '1-2', '2-3', '3-5', '5-10', '10-20', '20-50']
pd.cut(df['price'],bins=bins, labels=bin_labels, right=True).value_counts().sort_index().plot(kind='bar')

- The majority of properties are priced in the '1-2 crores' and '2-3 crores' ranges
- There's a significant drop in the number of properties priced above '5 crores'

In [None]:
# ecdf plot
ecdf = df['price'].value_counts().sort_index().cumsum()/len(df['price'])
plt.plot(ecdf.index, ecdf, marker='.', linestyle='none')
plt.grid()

In [None]:
plt.figure(figsize=(15, 6))

# Distribution plot without log transformation
plt.subplot(1, 2, 1)
sns.distplot(df['price'], kde=True, bins=50,color='skyblue')
plt.title('Distribution of prices (Original Data)')
plt.xlabel('Price (in Crores)')
plt.ylabel('Frequency')

# Distribution plot with log transformation
plt.subplot(1,2,2)
sns.distplot(np.log1p(df['price']), kde=True, bins=50, color='lightgreen')
plt.title('Distribution of price (Log Transformed)')
plt.xlabel('Log(Price)')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

np.log1p(x). This function computes the natural logarithm of 1+x. It's designed to provide more accurate results for values of x that are very close to zero.

Using np.log1p helps in transforming the price column while ensuring the any any value (including zero, if present) is handed appropriately.When we need to reverse the transformation, we can use np.expm1 which is computes e^x-1.

In [None]:
skewness = np.log1p(df['price']).skew()
kurtosis = np.log1p(df['price']).kurt()

print(skewness, kurtosis)

In [None]:
## Box Plot with log transformed data

plt.figure(figsize=(15, 6))

# Distribution plot without log transformation
plt.subplot(1, 2, 1)
sns.boxplot(df['price'], color='skyblue',orient='h')
plt.title('Distribution of Price (Original)')
plt.xlabel('Price (in Crores)')
plt.ylabel('Frequency')

# Distribution plot with Log transformation
plt.subplot(1, 2, 2)
sns.boxplot(np.log1p(df['price']), color='lightgreen',orient='h')
plt.title('Distribution of Price (Log Transformed Data)')
plt.xlabel('Log (Price)')
plt.ylabel('Frequency')

# plt.tight_layout()
plt.show()

### Price_per_sqFeets

In [None]:
df['Price_Per_sqFeets'].isnull().sum()

In [None]:
df['Price_Per_sqFeets'].describe()

In [None]:
sns.histplot(df['Price_Per_sqFeets'], bins=50, color='skyblue', kde=True)

Most properties have a price_per_sqFeets ranging between approximately 0 and 40,000. There is a significant concentration in the lower range, with few properties having exceptionally high price_per_sqfeets.

In [None]:
sns.boxplot(df['Price_Per_sqFeets'], color='lightgreen', orient='h')

The box plot clearly shows several outliers, especially on the higher side. The interquartile range (IQR) is relativaly compact, but there are many data point beyond the 'whiskers' of the box plot, indicating potential outliers

### Observation:

- Potential Outliers
- Right skewed data
- 19 missing values

## bedRoom

In [None]:
df['bedRoom'].isnull().sum()

In [None]:
df['bedRoom'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['bedRoom'].value_counts(normalize=True).head().plot(kind='pie',autopct='%0.2f%%')

### Bathroom

In [None]:
df['bathroom'].isnull().sum()

In [None]:
df['bathroom'].value_counts()

In [None]:
df['bathroom'].value_counts().sort_index().plot(kind='bar')

In [None]:
df['bathroom'].value_counts(normalize=True).sort_index().head().plot(kind='pie',autopct='%0.2f%%')

In [None]:
df.head()

In [None]:
df['balcony'].isnull().sum()

In [None]:
df['balcony'].value_counts()

In [None]:
df['balcony'].value_counts().head().plot(kind='bar')

In [None]:
df['balcony'].value_counts(normalize=True).plot(kind='pie', autopct= '%0.2f%%')

## floornum

In [None]:
df['floorNum'].isnull().sum()

In [None]:
df['floorNum'].describe()

In [None]:
df['floorNum'].value_counts().sort_index().plot(kind='bar')

In [None]:
sns.boxplot(df['floorNum'],color='lightgreen',orient='h')

### Observations:

- The majority of the properties lie between the ground floor (0) and the 25th floor
- Floors 1 to 4 are particularly common, with the 3rd floor being the most frequent.
- There are a few proparties located at higher floors, but their frequency is much lower
- The box plot reveal that the majority of the properties are concertrated around the lower floors. The interquartile range (IQR) lies between approximately the 2nd and 10th floors.
- Data points beyond the 'whiskers' of the box plot, especially on the higher side, indicate potential outliers.

### Facing

In [None]:
df['facing'].isnull().sum()

In [None]:
df['facing'].fillna('NA',inplace=True)

In [None]:
df['facing'].value_counts()

### agePossession

In [None]:
df['agePossession'].isnull().sum()

In [None]:
df['agePossession'].value_counts()

### areas

In [None]:
# super build up area
df['super_built_up_area'].isnull().sum()

In [None]:
df['super_built_up_area'].describe()

In [None]:
sns.histplot(df['super_built_up_area'].dropna(), bins=50, color='skyblue',kde= True)

In [None]:
sns.boxplot(df['super_built_up_area'], color='lightgreen',orient='h')

#### Observation:
- Most properties have a super built-up area ranging between approximately 1,000 sq. feets and 2500 sq.feets.
- There are a few properties with a significantly larger area, leading to a right skewed distribution.
- The Interquartile Range (IQR) is a between roughly 1,480 sq. feets and 2,215 sq. feets, indicating that the middle 50% of the properties fall within this range.
- There are serval data points beyond the upper "whisker" of box plot, indicating potential outliers. These are properties with an unusually large super built-up area

In [None]:
# built-up area
df['built_up_area'].isnull().sum()

In [None]:
df['built_up_area'].describe()

In [None]:
sns.histplot(df['built_up_area'].dropna(), bins=50, color='skyblue', kde=False)

In [None]:
sns.boxplot(df['built_up_area'],color='lightgreen',orient='h')

#### Observation:
- Most properties have a built-up area ranging roughly between 500 sq.feets and 3,500 sq feets.
- There are very few properties with a much larger built-up area leading to a highly right skewed distribution.
- The box plot confirms the presence of significant outliers on the higher side. The data's interquantile range (IQR) is relatively compact but the "whiskers" of the box plot are streched due to outliers.

The presence of extreme values, especially on the higher side, suggests that there may be outliers or data error. This could also be due to some properties being exceptionaly large, like a commercial complex or an entire building being listed.

In [None]:
# carpet area
df['carpet_area'].isnull().sum()

In [None]:
df['carpet_area'].describe()

In [None]:
sns.histplot(df['carpet_area'].dropna(), bins=50, color='skyblue',kde=False)

In [None]:
sns.boxplot(df['carpet_area'], color='lightgreen',orient='h')

In [None]:
df.iloc[:,16:]

### Additional rooms

In [None]:
plt.figure(figsize=(20,12))

#create a subplot of pie charts for each room type
for idx, room in enumerate(['study room', 'servant room', 'store room', 'pooja room', 'others'],1):
    ax = plt.subplot(2,3,idx)
    df[room].value_counts().plot(kind='pie',autopct='%1.1f%%',startangle=90, ax=ax)
    plt.title(f'Distribution of {room.title()}')
    plt.ylabel('')
    
plt.show()

### furnishing_type

In [None]:
df['furnishing_type'].isnull().sum()

In [None]:
df['furnishing_type'].value_counts()

In [None]:
df['furnishing_type'].value_counts().plot(kind='pie', autopct='%0.2f%%')

    => 2 is unfurnished
    => 1 is semi-furnished
    => 0 is unfurnished

### luxury score

In [None]:
df['luxury_score'].isnull().sum()

In [None]:
df['luxury_score'].describe()

In [None]:
sns.histplot(df['luxury_score'], bins=50, color='skyblue',kde=True)

In [None]:
sns.boxplot(df['luxury_score'], color='lightgreen', orient='h')

The luxury score distribution has multiple peeks, suggesting a multi-mode distribution. There's significant number of properties with lower luxury scrores (around 0-50), and other peak is observed around the 110-130 range.

The box plot reveals that the majority of the properties have luxury scores between approximately 30 and 110. The interquantile range (IQR) lies between these values.

### Multivariant Analysis

In [None]:
df.head()

## property_type vs price

In [None]:
sns.barplot(x=df['Property_type'],y=df['price'])

In [None]:
sns.boxplot(x=df['Property_type'], y=df['price'])

### Property_type vs Area

In [None]:
sns.barplot(x=df['Property_type'], y=df['area'])

In [None]:
df['area'].max()

In [None]:
# remove crazy outliers

df = df[df['area'] <= 14500.0]

In [None]:
sns.boxplot(x=df['Property_type'], y=df['area'])

### Property_type vs price_persqfeets

In [None]:
sns.barplot(x=df['Property_type'], y=df['Price_Per_sqFeets'], estimator=np.median)

In [None]:
sns.boxplot(x=df['Property_type'], y=df['Price_Per_sqFeets'])

In [None]:
sns.heatmap(pd.crosstab(df['Property_type'],df['bedRoom']))

In [None]:
sns.barplot(x=df['Property_type'],y=df['floorNum'])

In [None]:
sns.boxplot(x=df['Property_type'],y=df['floorNum'])

In [None]:
df.head()

In [None]:
sns.heatmap(pd.crosstab(df['Property_type'], df['agePossession']))

In [None]:
sns.heatmap(pd.pivot_table(df, index='Property_type',columns='agePossession', values='price', aggfunc='mean'), annot=True)

In [None]:
plt.figure(figsize=(15,4))
sns.heatmap(pd.pivot_table(df, index='Property_type', columns='bedRoom', values='price', aggfunc='mean'),annot=True)

In [None]:
sns.heatmap(pd.crosstab(df['Property_type'], df['furnishing_type']))

In [None]:
sns.heatmap(pd.pivot_table(df, index='Property_type', columns='furnishing_type', values='price'), annot=True)

In [None]:
sns.barplot(x=df['Property_type'],y=df['luxury_score'])

In [None]:
sns.boxplot(x=df['Property_type'], y= df['luxury_score'])

In [None]:
# sector analysis
plt.figure(figsize=(15,6))
sns.heatmap(pd.crosstab(df['Property_type'], df['sector']))

In [None]:
# sector analysis
import re
# Group by 'sector' and calculate the average price
avg_price_per_sector = df.groupby('sector')['price'].mean().reset_index()

# Function to extract sector numbers
def extract_sector_number(sector_name):
    match = re.search(r'\d+', sector_name)
    if match:
        return int(match.group())
    else:
        return float('inf') # Return a Large number for non-numbered sectors
    
avg_price_per_sector['sector_number'] = avg_price_per_sector['sector'].apply(extract_sector_number)

# sort by sector number
avg_price_per_sector_sorted_by_sector = avg_price_per_sector.sort_values(by='sector_number')

# Plote the heat map
plt.figure(figsize=(5,25))
sns.heatmap(avg_price_per_sector_sorted_by_sector.set_index('sector')[['price']],annot=True, fmt='.2f',linewidths=.5)
plt.title('Average price per sector (Sorted by sector number)')
plt.xlabel('Average Price')
plt.ylabel('sector')
plt.show()

In [None]:
import re
# Group by 'sector' and calculate the average price
avg_price_per_sector = df.groupby('sector')['Price_Per_sqFeets'].mean().reset_index()

# Function to extract sector numbers
def extract_sector_number(sector_name):
    match = re.search(r'\d+', sector_name)
    if match:
        return int(match.group())
    else:
        return float('inf') # Return a Large number for non-numbered sectors
    
avg_price_per_sector['sector_number'] = avg_price_per_sector['sector'].apply(extract_sector_number)

# sort by sector number
avg_price_per_sector_sorted_by_sector = avg_price_per_sector.sort_values(by='sector_number')

# Plote the heat map
plt.figure(figsize=(5,25))
sns.heatmap(avg_price_per_sector_sorted_by_sector.set_index('sector')[['Price_Per_sqFeets']],annot=True, fmt='.2f',linewidths=.5)
plt.title('Average price per sector (Sorted by sector number)')
plt.xlabel('Average Price')
plt.ylabel('sector')
plt.show()

In [None]:
# Group by 'sector' and calculate the average luxury score
avg_price_per_sector = df.groupby('sector')['luxury_score'].mean().reset_index()

# Function to extract sector numbers
def extract_sector_number(sector_name):
    match = re.search(r'\d+', sector_name)
    if match:
        return int(match.group())
    else:
        return float('inf') # Return a Large number for non-numbered sectors
    
avg_price_per_sector['sector_number'] = avg_price_per_sector['sector'].apply(extract_sector_number)

# sort by sector number
avg_price_per_sector_sorted_by_sector = avg_price_per_sector.sort_values(by='sector_number')

# Plote the heat map
plt.figure(figsize=(5,25))
sns.heatmap(avg_price_per_sector_sorted_by_sector.set_index('sector')[['luxury_score']],annot=True, fmt='.2f',linewidths=.5)
plt.title('Average price per sector (Sorted by sector number)')
plt.xlabel('Average Price')
plt.ylabel('sector')
plt.show()

### Price

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(x=df[df['area']<10000]['area'],y=df['price'],hue=df['bedRoom'])

### Outliers Treated

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
sns.distplot(df['price'])

In [None]:
sns.boxplot(x=df['price'])

In [None]:
# Calculate the IQR for the 'price' column

Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3-Q1

# defind boundries
lower_bound = Q1 - 1.5*IQR
upper_bound = Q3 + 1.5*IQR

# Identify Outliers
outliers = df[(df['price'] < lower_bound) |(df['price'] > upper_bound)]

# Display the number of outliers and some statistics
num_outliers = outliers.shape[0]
outliers_price_stats =outliers['price'].describe()

num_outliers, outliers_price_stats

In [None]:
outliers.sort_values('price',ascending=False).head(20)

In [None]:
# on the basis of price column is a genuanin data point

### Price_per_sqFeets

In [None]:
sns.distplot(df['Price_Per_sqFeets'])

In [None]:
sns.boxplot(x=df['Price_Per_sqFeets'])

In [None]:
# find Outliers using IQR method
Q1 = df['Price_Per_sqFeets'].quantile(0.25)
Q3 = df['Price_Per_sqFeets'].quantile(0.75)
IQR = Q3 - Q1

# Find boudries
lower_bound_pf = Q1 - 1.5*IQR
upper_bound_pf = Q3 + 1.5*IQR

outliers_price_sqfeets = df[(df['Price_Per_sqFeets'] < lower_bound_pf) | (df['Price_Per_sqFeets'] > upper_bound_pf)]

# Display the outliers and some statistics and description
num_outliers_pf = outliers_price_sqfeets.shape[0]
outliers_price_stats_pf = outliers_price_sqfeets['Price_Per_sqFeets'].describe()

num_outliers_pf, outliers_price_stats_pf

In [None]:
outliers_price_sqfeets['area'] = outliers_price_sqfeets['area'].apply(lambda x:x*9 if x<1000 else x)

In [None]:
outliers_price_sqfeets['Price_Per_sqFeets'] = round((outliers_price_sqfeets['price']*10000000)/outliers_price_sqfeets['area'])

In [None]:
outliers_price_sqfeets.head()

In [None]:
outliers_price_sqfeets['Price_Per_sqFeets'].describe()

In [None]:
df.update(outliers_price_sqfeets)

In [None]:
sns.distplot(df['Price_Per_sqFeets'])

In [None]:
sns.boxplot(x=df['Price_Per_sqFeets'])

In [None]:
df = df[df['Price_Per_sqFeets'] < 50000]

In [None]:
sns.boxplot(x=df['Price_Per_sqFeets'])

### Area

In [None]:
sns.distplot(df['area'])

In [None]:
sns.boxplot(x=df['area'])

In [None]:
df['area'].describe()

In [None]:
df[df['area'] > 100000].shape

In [None]:
df = df[df['area']<100000]

In [None]:
sns.distplot(df['area'])

In [None]:
sns.boxplot(x=df['area'])

In [None]:
df[df['area'] > 10000].sort_values('area', ascending=False)

In [None]:
df.drop(index=[3102, 2852, 1639, 3564, 3630], inplace=True)

In [None]:
df.drop(index=[3641], inplace=True)

In [None]:
df[df['area'] > 10000].sort_values('area',ascending=False)

In [None]:
df.loc[3283, 'area'] = 115*9
df.loc[3359, 'area'] = 7250
df.loc[3552, 'area'] = 5800
df.loc[3085, 'area'] = 2660
df.loc[3092, 'area'] = 2160
df.loc[2832, 'area'] = 1175
df.loc[3500, 'area'] = 3500

In [None]:
df.loc[3614, 'area'] = 2850

In [None]:
sns.distplot(df['area'])

In [None]:
sns.boxplot(x=df['area'])

In [None]:
df['area'].describe()

### BedRoom

In [None]:
sns.distplot(df['bedRoom'])

In [None]:
sns.boxplot(x=df['bedRoom'])

In [None]:
df['bedRoom'].describe()

In [None]:
df[df['bedRoom'] > 10].shape

In [None]:
df = df[df['bedRoom'] <= 10]

In [None]:
df.shape

In [None]:
df['bedRoom'].describe()

In [None]:
sns.distplot(df['bedRoom'])

In [None]:
sns.boxplot(x=df['bedRoom'])

In [None]:
sns.distplot(df['bathroom'])

In [None]:
sns.boxplot(x=df['bathroom'])

In [None]:
df[df['bathroom'] > 10].sort_values('bathroom', ascending=False)

In [None]:
df.head()

### Super built up area

In [None]:
sns.distplot(df['super_built_up_area'])

In [None]:
sns.boxplot(x=df['super_built_up_area'])

In [None]:
df['super_built_up_area'].describe()

In [None]:
df[df['super_built_up_area'] > 6000]

### built up area

In [None]:
sns.distplot(df['built_up_area'])

In [None]:
sns.boxplot(x=df['built_up_area'])

In [None]:
df = df[(df['built_up_area'] != 13500) | (df['built_up_area'] != 26000)]

In [None]:
df.shape

### carpet area

In [None]:
sns.distplot(df['carpet_area'])

In [None]:
sns.boxplot(x=df['carpet_area'])

In [None]:
df = df[df['carpet_area'] != 18122]

In [None]:
df.shape

In [None]:
sns.distplot(df['carpet_area'])

In [None]:
sns.distplot(df['luxury_score'])

In [None]:
sns.boxplot(x=df['luxury_score'])

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df['Price_Per_sqFeets']= round((df['price']*10000000)/df['area'])

In [None]:
df.head()

In [None]:
sns.distplot(df['Price_Per_sqFeets'])

In [None]:
sns.boxplot(x=df['Price_Per_sqFeets'])

In [None]:
x = df[df['Price_Per_sqFeets'] <= 20000]
(x['area']/x['bedRoom']).quantile(0.05)

In [None]:
df[(df['area']/df['bedRoom']) < 250]

In [None]:
sns.lmplot(data=df, x='area', y='bedRoom')

In [None]:
df['area_room_ratio'] = df['area']/df['bedRoom']

In [None]:
(df[df['area_room_ratio'] < 250])['bedRoom'].value_counts()

In [None]:
df = df[df['area_room_ratio'] > 100]

In [None]:
outliers_df = df[(df['area_room_ratio'] < 250) & (df['bedRoom'] >3)]

In [None]:
outliers_df['bedRoom'] = round(outliers_df['bedRoom']/outliers_df['floorNum'])

In [None]:
df.update(outliers_df)

In [None]:
df['area_room_ratio'] = df['area']/df['bedRoom']

In [None]:
df[(df['area_room_ratio'] < 250) & (df['bedRoom'] > 4)]

In [None]:
df[(df['area_room_ratio'] < 250) & (df['bedRoom'] > 4)].shape

In [None]:
df = df[~((df['area_room_ratio'] < 250) & (df['bedRoom'] > 4))]

In [None]:
sns.lmplot(data=df, x='area', y='bedRoom')

In [None]:
df.shape

### Missing Values Handle

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.isnull().sum()

### Built up area

In [None]:
sns.scatterplot(x=df['built_up_area'],y=df['super_built_up_area'])

In [None]:
sns.scatterplot(x=df['built_up_area'],y=df['carpet_area'])

In [None]:
all_present_df = df[~((df['super_built_up_area'].isnull()) | (df['built_up_area'].isnull()) | (df['carpet_area'].isnull()))]

In [None]:
all_present_df.shape

In [None]:
super_to_built_up_ratio = (all_present_df['super_built_up_area']/all_present_df['built_up_area']).median()

In [None]:
carpet_to_built_up_ratio = (all_present_df['carpet_area']/all_present_df['built_up_area']).median()

In [None]:
print(super_to_built_up_ratio,carpet_to_built_up_ratio)

In [None]:
# both present built up null

sbc_df = df[~(df['super_built_up_area'].isnull()) & (df['built_up_area'].isnull()) & ~(df['carpet_area'].isnull())]

In [None]:
sbc_df['built_up_area'].fillna(round(((sbc_df['super_built_up_area']/1.105) + (sbc_df['carpet_area']/0.9))/2), inplace=True)

In [None]:
sbc_df

In [None]:
df.update(sbc_df)

In [None]:
df.isnull().sum()

In [None]:
# super built up area present, Carpet_area is null, bult_up_area is null
sb_df = df[~(df['super_built_up_area'].isnull()) & (df['built_up_area'].isnull()) & (df['carpet_area'].isnull())]

In [None]:
sb_df.head()

In [None]:
sb_df['built_up_area'].fillna(round(sb_df['super_built_up_area']/1.105), inplace=True)

In [None]:
df.update(sb_df)

In [None]:
df.isnull().sum()

In [None]:
ca_df = df[(df['super_built_up_area'].isnull()) & (df['built_up_area'].isnull()) & ~(df['carpet_area'].isnull())]

In [None]:
ca_df

In [None]:
ca_df['built_up_area'].fillna(round(ca_df['carpet_area']/0.9), inplace=True)

In [None]:
ca_df

In [None]:
df.update(ca_df)

In [None]:
df.isnull().sum()

In [None]:
sns.scatterplot(x=df['built_up_area'],y=df['price'])

In [None]:
anamoly_df = df[(df['built_up_area'] < 2000) & (df['price'] > 2.5)][['price','area','built_up_area']]

In [None]:
anamoly_df.shape

In [None]:
anamoly_df['built_up_area'] = anamoly_df['area']

In [None]:
anamoly_df

In [None]:
df.update(anamoly_df)

In [None]:
sns.scatterplot(x = df['built_up_area'], y= df['price'])

In [None]:
df.head()

In [None]:
df.drop(columns=['area','areaWithType','super_built_up_area','carpet_area','area_room_ratio'], inplace=True)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df[df['floorNum'].isnull()]

In [None]:
df[df['Property_type'] == 'house']['floorNum'].median()

In [None]:
df['floorNum'].fillna(2.0, inplace=True)

In [None]:
df.isnull().sum()

### Facing

In [None]:
df['facing'].value_counts().plot(kind='pie',autopct='%0.2f%%')

In [None]:
df.drop(columns=['facing'], inplace=True)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.isnull().sum()

In [None]:
df[df['agePossession'] == 'Undefined']

In [None]:
def mode_based_imputation(row):
    if row['agePossession'] == 'Undefined':
        mode_value = df[(df['sector'] == row['sector']) & (df['Property_type'] == row['Property_type'])]['agePossession'].mode()
        if not mode_value.empty:
            return mode_value.iloc[0]
        else:
            return np.nan
    else:
        return row['agePossession']

In [None]:
df['agePossession'] = df.apply(mode_based_imputation,axis=1)

In [None]:
df['agePossession'].value_counts()

In [None]:
def mode_based_imputation2(row):
    if row['agePossession'] == 'Undefined':
        mode_value = df[(df['sector'] == row['sector'])]['agePossession'].mode()
        if not mode_value.empty:
            return mode_value.iloc[0]
        else:
            return np.nan
    else:
        return row['agePossession']

In [None]:
df['agePossession'] = df.apply(mode_based_imputation2, axis=1)

In [None]:
df['agePossession'].value_counts()

In [None]:
def mode_based_imputation3(row):
    if row['agePossession'] == 'Undefined':
        mode_value = df[(df['Property_type'] == row['Property_type'])]['agePossession'].mode()
        if not mode_value.empty:
            return mode_value.iloc[0]
        else:
            return np.nan
    else:
        return row['agePossession']

In [None]:
df['agePossession'] = df.apply(mode_based_imputation3,axis=1)

In [None]:
df['agePossession'].value_counts()

In [None]:
df.isnull().sum()

### Feature Selection

In [None]:
df.shape

In [None]:
df.head()

In [None]:
train_df = df.drop(columns=['society','Price_Per_sqFeets'])

In [None]:
train_df.head()

In [None]:
plt.figure(figsize=(15,6))
sns.heatmap(train_df.corr(), annot=True)

In [None]:
train_df.corr()['price'].sort_values(ascending=False)

### luxury score

In [None]:
sns.boxplot(df['luxury_score'], orient='h')

In [None]:
def categorize_luxury(score):
    if 0 <= score <50:
        return "Low"
    elif 50 <= score < 150:
        return "Medium"
    elif 150 <= score <= 175:
        return "High"
    else:
        return None

In [None]:
train_df['luxury_score'] = train_df['luxury_score'].apply(categorize_luxury)

In [None]:
train_df.head()

### floorNum

In [None]:
sns.boxplot(df['floorNum'], orient='h')

In [None]:
df['floorNum'].value_counts()

In [None]:
def categorize_floor(floor):
    if 0 <= floor <=2:
        return "Low Floor"
    elif 3 <= floor <= 10:
        return "Mid Floor"
    elif 11 <= floor <= 51:
        return "High Floor"
    else:
        return None

In [None]:
train_df['floor_category'] = train_df['floorNum'].apply(categorize_floor)

In [None]:
train_df.head()

In [None]:
train_df.drop(columns=['floorNum','luxury_score'], inplace=True)

In [None]:
train_df.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Create a copy of the original data for Label encoding
data_label_encoded = train_df.copy()

categorical_cols = train_df.select_dtypes(include=['object']).columns

# Apply Label encoding to categorical columns
for col in categorical_cols:
    oe = OrdinalEncoder()
    data_label_encoded[col] = oe.fit_transform(data_label_encoded[[col]])
    print(oe.categories_)
    
# Splitting the dataset into training and testing sets
x_label = data_label_encoded.drop('price', axis=1)
y_label = data_label_encoded['price']

In [None]:
x_label

## Feature Selection Technique-1 - Correlation Analysis

In [None]:
plt.figure(figsize=(15,6))
sns.heatmap(data_label_encoded.corr(), annot=True)

In [None]:
f1_df1 = data_label_encoded.corr()['price'].iloc[1:].to_frame().reset_index().rename(columns={'index':'feature','price':'corr_coeff'})
f1_df1

### Technique 2 - Random Forest Feature Importance

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train a Random Forest regression on lebel encoded data
rf_label = RandomForestRegressor(n_estimators=100, random_state=42)
rf_label.fit(x_label,y_label)

# Extract feature importance scores for Label encoded data
fi_df2 = pd.DataFrame({
    'feature':x_label.columns,
    'rf_importance': rf_label.feature_importances_
}).sort_values(by='rf_importance', ascending=False)

In [None]:
fi_df2

### Technique 3 - Gradient Boosting Feature importances

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Train a Gradient Boosting Regressor on Label encoded data
gb_label = GradientBoostingRegressor()
gb_label.fit(x_label,y_label)

# Extract feature importance scores for label encoded data
fi_df3 = pd.DataFrame({
    'feature': x_label.columns,
    'gb_importance': gb_label.feature_importances_
}).sort_values(by='gb_importance', ascending=False)

fi_df3

### Technique - 4 - Permutation Importance

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

x_train_label, x_test_label, y_train_label, y_test_label = train_test_split(x_label, y_label, test_size=0.2, random_state=42)

# Train a Random Forest Regrassor on label encoded data
rf_label = RandomForestRegressor(n_estimators=100, random_state=42)
rf_label.fit(x_train_label, y_train_label)

# Calculate Permutation Importance
perm_importance = permutation_importance(rf_label, x_test_label,y_test_label, n_repeats=30, random_state=42)

# Organize results into a DataFrame

fi_df4 = pd.DataFrame({
    'feature': x_label.columns,
    'permutation_importance': perm_importance.importances_mean
}).sort_values(by='permutation_importance', ascending=False)

In [None]:
fi_df4

### Technique 5 - LASSO

In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Standardize the features
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_label)

# Train a LASSO regression model
# We'll use a relatively small value for alpha (the regularization strength) for demonstration purposes
lasso = Lasso(alpha=0.01, random_state=42)
lasso.fit(x_label, y_label)

#Extract coefficients
fi_df5 = pd.DataFrame({
    'feature': x_label.columns,
    'lasso_coeff': lasso.coef_
}).sort_values(by='lasso_coeff', ascending=False)

fi_df5

### Technique 6 - RFE

In [None]:
from sklearn.feature_selection import RFE

# Initialize the base estimator
estimator = RandomForestRegressor()

# Apply RFE on the label-encoded and standardized training data
selector_label = RFE(estimator, n_features_to_select=x_label.shape[1], step=1)
selector_label = selector_label.fit(x_label, y_label)

# Get the selected features based on REF
selected_features = x_label.columns[selector_label.support_]

# Extract the coefficients for the selected features from the underlying Linearregression model
selected_coefficients = selector_label.estimator_.feature_importances_

# Organize the results into a DataFrame
fi_df6 = pd.DataFrame({
    'feature': selected_features,
    'rfe_score': selected_coefficients
}).sort_values(by='rfe_score', ascending=False)

fi_df6

### Technique 7 - Linear Regression Weights

In [None]:
# Train a linear regression model on the Label encoded and standardized training data
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(x_scaled, y_label)

# Extract coefficients
fi_df7 = pd.DataFrame({
    'feature': x_label.columns,
    'reg_coeffs': lin_reg.coef_
}).sort_values(by='reg_coeffs', ascending=False)

fi_df7

In [None]:
final_fi_df = f1_df1.merge(fi_df2, on='feature').merge(fi_df3, on='feature').merge(fi_df4, on='feature').merge(fi_df5, on='feature').merge(fi_df6, on='feature').merge(fi_df7, on = 'feature').set_index('feature')

In [None]:
final_fi_df

In [None]:
# normalize the score
final_fi_df = final_fi_df.divide(final_fi_df.sum(axis=0), axis=1)

In [None]:
final_fi_df[['rf_importance','gb_importance','permutation_importance','rfe_score']].mean(axis=1).sort_values(ascending=False)

In [None]:
# to drop pooja room, study room, others
x_label

In [None]:
# with all the cols
from sklearn.model_selection import cross_val_score

rf = RandomForestRegressor(n_estimators=100, random_state=42)

scorrs = cross_val_score(rf, x_label, y_label, cv=5, scoring='r2')

In [None]:
scorrs.mean()

In [None]:
scores = cross_val_score(rf, x_label.drop(columns=['pooja room','study room','others']), y_label, cv=5, scoring='r2')

In [None]:
scores.mean()

In [None]:
export_df = x_label.drop(columns=['pooja room','study room','others'])
export_df['price'] = y_label

In [None]:
final_df = train_df.drop(columns=['pooja room','study room','others'])
final_df['price'] =y_label

In [None]:
final_df

In [None]:
final_df['luxury_category'] = df['luxury_score'].apply(categorize_luxury)

In [None]:
final_df

In [None]:
final_df.isnull().sum()

### Model Train and Select

In [None]:
final_df.head()

In [None]:
final_df.info()

In [None]:
final_df['furnishing_type'].value_counts()

In [None]:
# 2 -> unfurnished
# 0 -> semifurnished
# 1 -> furnished
final_df['furnishing_type'] = final_df['furnishing_type'].replace({2.0:'unfurnished',0.0:'semifurnished',1.0:'furnished'})

In [None]:
final_df.head()

In [None]:
X = final_df.drop(columns=['price'])
y = final_df['price']

In [None]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [None]:
X.isnull().sum()

### Ordinal Encoding

In [None]:
columns_to_encode = ['Property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [None]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean(),scores.std()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
y_pred = np.expm1(y_pred)

In [None]:
mean_absolute_error(np.expm1(y_test),y_pred)

In [None]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_output

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

### OneHotEncoding

In [None]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean()

In [None]:
scores.std()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
y_pred = np.expm1(y_pred)

In [None]:
mean_absolute_error(np.expm1(y_test),y_pred)

In [None]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

### OneHotEncoding With PCA

In [None]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean()

In [None]:
scores.std()

In [None]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

### Target Encoder

In [None]:
import category_encoders as ce

columns_to_encode = ['Property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [None]:
#!pip install category_encoders

In [None]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [None]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [None]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y=y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean(),scores.std()

In [None]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [None]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [None]:
model_df.sort_values(['mae'])

### Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [None]:
columns_to_encode = ['Property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [None]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

In [None]:
search.best_score_

In [None]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=500))
])

In [None]:
pipeline.fit(X,y_transformed)

In [None]:
import pickle

with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [None]:
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)

In [None]:
X

### Trying out the predictions

In [None]:
X.columns

In [None]:
X.iloc[0].values

In [None]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['Property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


In [None]:
np.expm1(pipeline.predict(one_df))