In [324]:
# importing the required libraries
import pandas as pd
import numpy as np
from pathlib import Path as path
import os

import warnings
warnings.filterwarnings('ignore')

In [325]:
# path 
home_dir = path.cwd().parent             # home directory
data_dir = home_dir / 'data'             # data directory
interim_data_dir = data_dir / 'interim'  # interim dataset directory

In [326]:
[i for i in interim_data_dir.iterdir()]

[WindowsPath('c:/Sada/Projects/real-estate-property-price-predictor/data/interim/.gitkeep'),
 WindowsPath('c:/Sada/Projects/real-estate-property-price-predictor/data/interim/flat-cleaned.csv'),
 WindowsPath('c:/Sada/Projects/real-estate-property-price-predictor/data/interim/flat-house-dataset.csv'),
 WindowsPath('c:/Sada/Projects/real-estate-property-price-predictor/data/interim/flat-house-dataset_v2.csv'),
 WindowsPath('c:/Sada/Projects/real-estate-property-price-predictor/data/interim/house-cleaned.csv')]

In [327]:
# list the dataset in the 'interim data' directory
# assign the paths to variables

flat_data_path = interim_data_dir / 'flat-cleaned.csv'
house_data_path = interim_data_dir / 'house-cleaned.csv'

# printing the paths
print(f"flat dataset path: {flat_data_path}")
print(f"house dataset path: {house_data_path}")

flat dataset path: c:\Sada\Projects\real-estate-property-price-predictor\data\interim\flat-cleaned.csv
house dataset path: c:\Sada\Projects\real-estate-property-price-predictor\data\interim\house-cleaned.csv


In [328]:
# reading the dataset with pandas 
flats = pd.read_csv(flat_data_path)
houses = pd.read_csv(house_data_path)

In [329]:
# checking if two dataset has same columns 
flats.columns.tolist() == houses.columns.tolist()

True

In [330]:
# concatinating the two datasets
df = pd.concat([flats, houses], ignore_index=True)

In [331]:
# checking few sample rows from cancatinated dataset
df.sample(5)

Unnamed: 0,property_name,property_type,society,price,price_per_sqft,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating
2969,3 BHK Flat in Sector 37C Gurgaon,flat,corona optus,1.5,6361.0,2358.0,Super Built up area 2358(219.07 sq.m.),3.0,4,3+,"servant room,others","Tower J, 14th Floor, Sector 37C Gurgaon, Gurga...",14.0,North-East,0 to 1 Year Old,"['The Esplanade Mall', 'Gurugram Road', 'Delhi...",Property is a specious park facing and north f...,"['3 Wardrobe', '7 Fan', '3 Geyser', '15 Light'...","['Security / Fire Alarm', 'Intercom Facility',...","['Management4 out of 5', 'Green Area5 out of 5..."
39,3 BHK Flat in Sector 43 Gurgaon,flat,ansal sushant lok ci,2.0,15151.0,1320.0,Carpet area: 1320 (122.63 sq.m.),3.0,3,2,servant room,"C 488,1 St Floor, Sector 43 Gurgaon, Gurgaon, ...",1.0,North,10+ Year Old,"['Huda city centre metro station', 'Sector 42-...","Its huge inside,the bebrooms are quote spaciou...","['3 Bed', '5 Wardrobe', '6 Fan', '1 Sofa', '1 ...","['Water Storage', 'Park']","['Environment4 out of 5', 'Safety4 out of 5', ..."
1255,2 BHK Flat in Sector 84 Gurgaon,flat,pivotal devaan,0.34,7039.0,483.0,Carpet area: 483 (44.87 sq.m.),2.0,2,1,store room,"108, Sector 84 Gurgaon, Gurgaon, Haryana",1.0,East,1 to 5 Year Old,"['Dwarka Expressway', 'RPS International Schoo...",Pivotal devaan pivotal infrastructure propelli...,"['1 Wardrobe', '3 Fan', '1 Exhaust Fan', '5 Li...","['Security / Fire Alarm', 'Power Back-up', 'Li...","['Environment4 out of 5', 'Lifestyle4 out of 5..."
3657,5 Bedroom House for sale in Sector 50 Gurgaon,house,ss hibiscus,12.25,20940.0,5850.0,Plot area 5850(543.48 sq.m.),5.0,7,3+,"pooja room,study room,servant room,others","A-23, Sector 50 Gurgaon, Gurgaon, Haryana",1.0,South-East,1 to 5 Year Old,"['Baani Square', 'SS Plaza', 'Good Earth City ...",Rare inventory \nSs hibiscus villa \n650 yards...,"['1 Water Purifier', '1 Fan', '1 Exhaust Fan',...","['Centrally Air Conditioned', 'Water purifier'...","['Environment5 out of 5', 'Lifestyle5 out of 5..."
3795,4 Bedroom House for sale in Sushant Lok Phase 3,house,independent,3.5,21605.0,1620.0,Plot area 180(150.5 sq.m.),4.0,4,3+,store room,"78787, Sushant Lok Phase 3, Gurgaon, Haryana",3.0,South,10+ Year Old,"['Radhakrishna Shani Mandir', 'Sanatan Dharm M...",Ansal florence villa in 180yds is available fo...,"['6 Wardrobe', '8 Fan', '1 Exhaust Fan', '4 Ge...","['High Ceiling Height', 'Maintenance Staff', '...","['Environment4 out of 5', 'Safety4 out of 5', ..."


In [332]:
# shuffling the entire concatinated dataset
df = df.sample(df.shape[0], ignore_index=True)

In [333]:
# changing the pandas setting to see all the rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [334]:
# checking the shape of the dataset
print(f"Shape of the dataset: {df.shape}")

Shape of the dataset: (3951, 20)


In [335]:
# check for duplicate rows
print(f"Total duplicate rows in the dataset: {df.duplicated().sum()}")

Total duplicate rows in the dataset: 0


In [336]:
# missing values in the dataset
print(f"Missing values in the dataset:\n{df.isnull().sum().sort_values(ascending=False)}")

Missing values in the dataset:
facing             1170
furnishDetails     1032
features            705
rating              448
nearbyLocations     205
price                20
price_per_sqft       20
area                 20
floorNum             19
address              11
agePossession         1
description           0
property_name         0
property_type         0
balcony               0
bathroom              0
bedRoom               0
areaWithType          0
society               0
additionalRoom        0
dtype: int64


In [337]:
# taking a brief view of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951 entries, 0 to 3950
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_name    3951 non-null   object 
 1   property_type    3951 non-null   object 
 2   society          3951 non-null   object 
 3   price            3931 non-null   float64
 4   price_per_sqft   3931 non-null   float64
 5   area             3931 non-null   float64
 6   areaWithType     3951 non-null   object 
 7   bedRoom          3951 non-null   float64
 8   bathroom         3951 non-null   int64  
 9   balcony          3951 non-null   object 
 10  additionalRoom   3951 non-null   object 
 11  address          3940 non-null   object 
 12  floorNum         3932 non-null   float64
 13  facing           2781 non-null   object 
 14  agePossession    3950 non-null   object 
 15  nearbyLocations  3746 non-null   object 
 16  description      3951 non-null   object 
 17  furnishDetails

In [338]:
# adding 'sector' column in the dataset for sector information
df.insert(loc=3, column='sector', value=df['property_name'].str.split('in').str.get(1).str.replace('Gurgaon', '').str.strip())

In [339]:
# checking few rows from the dataset
df.sample(5)

Unnamed: 0,property_name,property_type,society,sector,price,price_per_sqft,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating
3357,4 BHK Flat in Golf Course Ext Road,flat,ansals api esencia,Golf Course Ext Road,2.4,8000.0,3000.0,Built Up area: 2500 (232.26 sq.m.)Carpet area:...,4.0,4,3+,not available,"D Block, Ansal Esencia, Golf Course Ext Road ,...",1.0,East,0 to 1 Year Old,"['Sri Radhe Krishna Temple', 'Kamal Hospital a...","418 sq yard, 2200 sq feet, east facing, 4 bhk,...","['1 Exhaust Fan', '4 Geyser', '6 Light', '6 AC...","['Security / Fire Alarm', 'Intercom Facility',...",
1636,4 BHK Flat in Gwal Pahari,flat,paras quartier,Gwal Pahari,5.9,11028.0,5350.0,Built Up area: 5350 (497.03 sq.m.),4.0,4,3+,not available,"Paras Quartier, Valley View Estate, Gwal Pahar...",0.0,,5 to 10 Year Old,"['Sector 55-56 Metro Station', 'South point Ma...","New apartment, lush green aravalli valley 270 ...","['1 Light', 'No AC', 'No Bed', 'No Chimney', '...",,
3304,4 BHK Flat in Sector 109 Gurgaon,flat,chintels paradiso,Sector 109,1.97,7490.0,2630.0,Super Built up area 2630(244.33 sq.m.),4.0,4,2,"pooja room,servant room","Sector 109 Gurgaon, Gurgaon, Haryana",11.0,North-East,1 to 5 Year Old,"['Gyaananda School', 'The NorthCap University'...",It is a spacious apartment in premium tower wi...,"['6 AC', '1 Chimney', '1 Modular Kitchen', 'No...","['Feng Shui / Vaastu Compliant', 'Security / F...","['Green Area5 out of 5', 'Construction4 out of..."
851,2 BHK Flat in Sector 7 Gurgaon,flat,project krishna colony,Sector 7,0.36,5142.0,700.0,Built Up area: 700 (65.03 sq.m.),2.0,2,1,not available,"Sector 7 Gurgaon, Gurgaon, Haryana",0.0,,1 to 5 Year Old,"['State bank ATM', 'Dr. Madan Clinic', 'Taneja...",Residential apartment for sell.Located in sect...,,,"['Environment4 out of 5', 'Safety4 out of 5', ..."
3062,2 BHK Flat in Sector 69 Gurgaon,flat,surendra avenue 69,Sector 69,0.75,6818.0,1100.0,Built Up area: 1100 (102.19 sq.m.),2.0,2,2,not available,"Sector 69 Gurgaon, Gurgaon, Haryana",4.0,,5 to 10 Year Old,"['Sri Radhe Krishna Temple', 'Icici bank ATM',...",Residential apartment for sell.Located in sect...,"['1 Bed', '1 Wardrobe', '1 Fan', '1 Sofa', '1 ...",,"['Environment4 out of 5', 'Lifestyle4 out of 5..."


In [340]:
# lowering the cases in 'sector' column
df['sector'] = df['sector'].str.lower()

In [341]:
# replacing the non-sector values with sector values

df['sector'] = df['sector'].str.replace('dharam colony','sector 12')
df['sector'] = df['sector'].str.replace('krishna colony','sector 7')
df['sector'] = df['sector'].str.replace('suncity','sector 54')
df['sector'] = df['sector'].str.replace('prem nagar','sector 13')
df['sector'] = df['sector'].str.replace('mg road','sector 28')
df['sector'] = df['sector'].str.replace('gandhi nagar','sector 28')
df['sector'] = df['sector'].str.replace('laxmi garden','sector 11')
df['sector'] = df['sector'].str.replace('shakti nagar','sector 11')

df['sector'] = df['sector'].str.replace('baldev nagar','sector 7')
df['sector'] = df['sector'].str.replace('shivpuri','sector 7')
df['sector'] = df['sector'].str.replace('garhi harsaru','sector 17')
df['sector'] = df['sector'].str.replace('imt manesar','manesar')
df['sector'] = df['sector'].str.replace('adarsh nagar','sector 12')
df['sector'] = df['sector'].str.replace('shivaji nagar','sector 11')
df['sector'] = df['sector'].str.replace('bhim nagar','sector 6')
df['sector'] = df['sector'].str.replace('madanpuri','sector 7')

df['sector'] = df['sector'].str.replace('saraswati vihar','sector 28')
df['sector'] = df['sector'].str.replace('arjun nagar','sector 8')
df['sector'] = df['sector'].str.replace('ravi nagar','sector 9')
df['sector'] = df['sector'].str.replace('vishnu garden','sector 105')
df['sector'] = df['sector'].str.replace('bhondsi','sector 11')
df['sector'] = df['sector'].str.replace('surya vihar','sector 21')
df['sector'] = df['sector'].str.replace('devilal colony','sector 9')
df['sector'] = df['sector'].str.replace('valley view estate','gwal pahari')

df['sector'] = df['sector'].str.replace('mehrauli  road','sector 14')
df['sector'] = df['sector'].str.replace('jyoti park','sector 7')
df['sector'] = df['sector'].str.replace('ansal plaza','sector 23')
df['sector'] = df['sector'].str.replace('dayanand colony','sector 6')
df['sector'] = df['sector'].str.replace('sushant lok phase 2','sector 55')
df['sector'] = df['sector'].str.replace('chakkarpur','sector 28')
df['sector'] = df['sector'].str.replace('greenwood city','sector 45')
df['sector'] = df['sector'].str.replace('subhash nagar','sector 12')

df['sector'] = df['sector'].str.replace('sohna road road','sohna road')
df['sector'] = df['sector'].str.replace('malibu town','sector 47')
df['sector'] = df['sector'].str.replace('surat nagar 1','sector 104')
df['sector'] = df['sector'].str.replace('new colony','sector 7')
df['sector'] = df['sector'].str.replace('mianwali colony','sector 12')
df['sector'] = df['sector'].str.replace('jacobpura','sector 12')
df['sector'] = df['sector'].str.replace('rajiv nagar','sector 13')
df['sector'] = df['sector'].str.replace('ashok vihar','sector 3')

df['sector'] = df['sector'].str.replace('dlf phase 1','sector 26')
df['sector'] = df['sector'].str.replace('nirvana country','sector 50')
df['sector'] = df['sector'].str.replace('palam vihar','sector 2')
df['sector'] = df['sector'].str.replace('dlf phase 2','sector 25')
df['sector'] = df['sector'].str.replace('sushant lok phase 1','sector 43')
df['sector'] = df['sector'].str.replace('laxman vihar','sector 4')
df['sector'] = df['sector'].str.replace('dlf phase 4','sector 28')
df['sector'] = df['sector'].str.replace('dlf phase 3','sector 24')

df['sector'] = df['sector'].str.replace('sushant lok phase 3','sector 57')
df['sector'] = df['sector'].str.replace('dlf phase 5','sector 43')
df['sector'] = df['sector'].str.replace('rajendra park','sector 105')
df['sector'] = df['sector'].str.replace('uppals southend','sector 49')
df['sector'] = df['sector'].str.replace('sohna','sohna road')
df['sector'] = df['sector'].str.replace('ashok vihar phase 3 extension','sector 5')
df['sector'] = df['sector'].str.replace('south city 1','sector 41')
df['sector'] = df['sector'].str.replace('ashok vihar phase 2','sector 5')

In [342]:
# considering only sectors which appeared more that 3 times
a = df['sector'].value_counts()[df['sector'].value_counts() >= 3]
df = df[df['sector'].isin(a.index)]

In [343]:
# checking the unique sectors in the dataset
df.sector.value_counts()

sector
sohna road                    163
sector 102                    113
sector 85                     110
sector 92                     102
sector 69                      94
sector 65                      90
sector 90                      90
sector 81                      90
sector 109                     88
sector 79                      80
sector 104                     73
sector 83                      69
sector 37d                     68
sector 86                      67
sector 50                      65
sector 107                     60
sector 108                     59
sector 56                      57
sector 95                      57
sector 89                      56
sector 48                      56
sector 2                       55
sector 70a                     54
sector 70                      53
sector 43                      53
sector 37c                     53
sector 84                      51
sector-33 sohna road           51
sector 26                      50
sector 

In [344]:
# replacing the non-sector values with sector values

df['sector'] = df['sector'].str.replace('sector 95a','sector 95')
df['sector'] = df['sector'].str.replace('sector 23a','sector 23')
df['sector'] = df['sector'].str.replace('sector 12a','sector 12')
df['sector'] = df['sector'].str.replace('sector 3a','sector 3')
df['sector'] = df['sector'].str.replace('sector 110 a','sector 110')
df['sector'] = df['sector'].str.replace('patel nagar','sector 15')
df['sector'] = df['sector'].str.replace('a block sector 43','sector 43')
df['sector'] = df['sector'].str.replace('maruti kunj','sector 12')
df['sector'] = df['sector'].str.replace('b block sector 43','sector 43')

df['sector'] = df['sector'].str.replace('sector-33 sohna road','sector 33')
df['sector'] = df['sector'].str.replace('sector 1 manesar','manesar')
df['sector'] = df['sector'].str.replace('sector 4 phase 2','sector 4')
df['sector'] = df['sector'].str.replace('sector 1a manesar','manesar')
df['sector'] = df['sector'].str.replace('c block sector 43','sector 43')
df['sector'] = df['sector'].str.replace('sector 89 a','sector 89')
df['sector'] = df['sector'].str.replace('sector 2 extension','sector 2')
df['sector'] = df['sector'].str.replace('sector 36 sohna road','sector 36')

In [345]:
for i in df.sector.value_counts().index.tolist():
    if 'sector' not in str(i):
        print(i)

sohna road
manesar
gwal pahari
sohna road road
dwarka expressway
new


In [346]:
df[df['sector']=='new']

Unnamed: 0,property_name,property_type,society,sector,price,price_per_sqft,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating
29,2 BHK Flat in New Gurgaon,flat,green court,new,0.38,5507.0,690.0,Carpet area: 690 (64.1 sq.m.),2.0,2,1,not available,"New Gurgaon, Gurgaon, Haryana",7.0,,Under Construction,"['Ing bank ATM', 'Dcb bank ATM', 'Indus ind ba...",We are the proud owners of this 2 bhk apartmen...,[],"['Intercom Facility', 'Lift(s)', 'Maintenance ...",
762,2 BHK Flat in New Gurgaon,flat,takshila heights sector 37 c,new,0.67,5583.0,1200.0,Super Built up area 1200(111.48 sq.m.),2.0,2,2,not available,"New Gurgaon, Gurgaon, Haryana",3.0,,1 to 5 Year Old,"['Shri Balaji Hospital and Trauma Center', 'S....",Check out this 2 bhk apartment for sale in tak...,[],"['Lift(s)', 'Swimming Pool', 'Visitor Parking'...",
1169,4 BHK Flat in New Gurgaon,flat,dlf 76,new,4.0,11428.0,3500.0,Carpet area: 3500 (325.16 sq.m.),4.0,4,2,"study room,servant room","New Gurgaon, Gurgaon, Haryana",4.0,,Jun 2027,"['Shri Balaji Hospital and Trauma Center', 'S....",This lovely 4 bhk apartment/flat in new gurgao...,"['6 Wardrobe', '1 Fridge', '8 Fan', '1 Exhaust...","['Security / Fire Alarm', 'Feng Shui / Vaastu ...",
3034,4 BHK Flat in New Gurgaon,flat,sare homes,new,0.85,4786.0,1776.0,Super Built up area 1776(165 sq.m.),4.0,4,3,not available,"New Gurgaon, Gurgaon, Haryana",3.0,,5 to 10 Year Old,"['Columbia Asia Hospital', 'Apex Multi Special...",Located in the popular residential address of ...,[],,


In [347]:
df['sector'] = df['sector'].str.replace('sohna road','sector 2')
df['sector'] = df['sector'].str.replace('manesar','sector 1')
df['sector'] = df['sector'].str.replace('gwal pahari','sector 2')
df['sector'] = df['sector'].str.replace('sohna road road','sector 2')
df['sector'] = df['sector'].str.replace('dwarka expressway','sector 26')

In [348]:
df.loc[df[(df['sector']=='new') & (df['society'] == 'dlf 76')].index[0], 'sector'] = 'sector76'
df.loc[df[(df['sector']=='new') & (df['society'] == 'takshila heights sector 37 c')].index[0], 'sector'] = 'sector 37'
df.loc[df[(df['sector']=='new') & (df['society'] == 'green court')].index[0], 'sector'] = 'sector 90'
df.loc[df[(df['sector']=='new') & (df['society'] == 'sare homes')].index[0], 'sector'] = 'sector 92'

In [349]:
# checking for any non-sector values in 'sector' column
for i in df.sector.value_counts().index.tolist():
    if 'sector' not in str(i):
        print(i)

In [350]:
# checking if any sector value doesn't starts with 'sector'
[i for i in df.sector.value_counts().index if not str(i).startswith('sector')]

['new sector 2']

In [351]:
# checking the rows where sector name is 'new sector 2'
df[df['sector'] =='new sector 2']

Unnamed: 0,property_name,property_type,society,sector,price,price_per_sqft,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating
349,3 Bedroom House for sale in New Palam Vihar,house,independent,new sector 2,1.0,8796.0,1137.0,Plot area 120(100.34 sq.m.)Built Up area: 120 ...,3.0,2,2,pooja room,"Q-148, New Palam Vihar, Phase-2, Near Royal Oa...",1.0,North,10+ Year Old,"['Palam Vihar Vyapar kendra', 'Palam triangle'...","Ground and first floor, Ground floor: Ground f...",,,"['Environment4 out of 5', 'Lifestyle4 out of 5..."
2754,2 BHK Flat in New Palam Vihar,flat,ompee k s residency,new sector 2,1.6,26936.0,594.0,Carpet area: 66 (55.18 sq.m.),2.0,2,2,not available,"New Palam Vihar, Gurgaon, Haryana",1.0,,1 to 5 Year Old,"['Palam Vihar Vyapar kendra', 'Palam triangle'...",We are the proud owners of this 2 bhk apartmen...,,,"['Environment4 out of 5', 'Safety4 out of 5', ..."
2820,2 Bedroom House for sale in New Palam Vihar,house,my home,new sector 2,0.34,12592.0,270.0,Plot area 270(25.08 sq.m.),2.0,2,2,not available,"Ez-19 A, New Palam Vihar, Gurgaon, Haryana",3.0,West,5 to 10 Year Old,"['Palam Vihar Vyapar kendra', 'Palam triangle'...",There are availability of various facilities l...,"['1 Wardrobe', '3 Fan', '6 Light', 'No AC', 'N...","['Water Storage', 'Park', 'Visitor Parking']","['Environment4 out of 5', 'Lifestyle4 out of 5..."


In [352]:
# replacing the 'new sector 2' with 'sector 2'
df.loc[df['sector']=='new sector 2', 'sector'] = 'sector 2'

In [353]:
# updating the below sector values to in proper format
# sector 2 road 
# sector 3 phase 3 extension
# sector 3 phase 2
# sector76 

df.loc[df['sector'] == 'sector 2 road', 'sector'] = 'sector 2'
df.loc[df['sector'] == 'sector 3 phase 3 extension', 'sector'] = 'sector 3'
df.loc[df['sector'] == 'sector 3 phase 2', 'sector'] = 'sector 3'
df.loc[df['sector'] == 'sector76', 'sector'] = 'sector 76'

In [354]:
# checking the values counts to check the correct name formatting the in 'sector' column
df.sector.value_counts()

sector
sector 2      257
sector 102    113
sector 85     110
sector 92     103
sector 69      94
sector 90      91
sector 81      90
sector 65      90
sector 109     88
sector 79      80
sector 33      74
sector 104     73
sector 83      69
sector 37d     68
sector 86      67
sector 43      67
sector 95      66
sector 50      65
sector 89      64
sector 107     60
sector 108     59
sector 56      57
sector 26      57
sector 48      56
sector 70a     54
sector 70      53
sector 37c     53
sector 84      51
sector 49      48
sector 67      48
sector 4       48
sector 28      47
sector 66      46
sector 3       44
sector 103     44
sector 25      43
sector 113     43
sector 61      42
sector 82      42
sector 7       39
sector 1       39
sector 106     38
sector 12      37
sector 99a     32
sector 54      31
sector 68      30
sector 72      29
sector 77      29
sector 71      27
sector 112     24
sector 57      24
sector 24      24
sector 14      23
sector 88a     23
sector 105     23
sec

In [355]:
# checking the shape of the dataset
print(f"Shape of the dataset: {df.shape}")

Shape of the dataset: (3793, 21)


In [356]:
df.columns

Index(['property_name', 'property_type', 'society', 'sector', 'price',
       'price_per_sqft', 'area', 'areaWithType', 'bedRoom', 'bathroom',
       'balcony', 'additionalRoom', 'address', 'floorNum', 'facing',
       'agePossession', 'nearbyLocations', 'description', 'furnishDetails',
       'features', 'rating'],
      dtype='object')

In [357]:
# dropping some feature, which we will no need in further analysis
dropping_columns = ['property_name', 'address', 'description', 'rating']

df.drop(columns = dropping_columns, inplace=True)

In [358]:
# checking few rows from the dataset
df.sample(5)

Unnamed: 0,property_type,society,sector,price,price_per_sqft,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,floorNum,facing,agePossession,nearbyLocations,furnishDetails,features
603,flat,rof ananda,sector 95,0.21,61.0,34426.0,Carpet area: 34401 (3195.96 sq.m.),1.0,1,1,not available,13.0,North,1 to 5 Year Old,"['Metro', 'Dwarka Expressway', 'Rajeev Chowk',...",,"['Security / Fire Alarm', 'Intercom Facility',..."
2598,house,dlf city plots phase 2,sector 25,10.0,400000.0,250.0,Plot area 250(23.23 sq.m.),12.0,12,3+,"study room,servant room",4.0,North,1 to 5 Year Old,"['Vodafone belvedere towers metro station', 'D...","['24 Fan', '1 Exhaust Fan', '12 Geyser', '1 St...","['Security / Fire Alarm', 'Feng Shui / Vaastu ..."
3499,house,independent,sector 11,2.1,17284.0,1215.0,Plot area 135(112.88 sq.m.),6.0,6,3+,not available,3.0,,1 to 5 Year Old,"['Rajiv Chowk Mosque', 'Rachna Dental Clinic',...","['3 Chimney', 'No AC', 'No Bed', 'No Curtains'...","['Water Storage', 'Visitor Parking', 'Waste Di..."
3249,flat,parsvnath green ville,sector 48,3.3,9984.0,3305.0,Super Built up area 3905(362.79 sq.m.)Built Up...,5.0,5,3+,servant room,4.0,,10+ Year Old,"['Sri Radhe Krishna Temple', 'Icici bank ATM',...",,"['Security / Fire Alarm', 'Private Garden / Te..."
1037,flat,emaar mgf the palm drive,sector 66,2.85,14615.0,1950.0,Super Built up area 1950(181.16 sq.m.)Built Up...,3.0,3,3,study room,11.0,North-East,1 to 5 Year Old,"['Sector 55-56 Rapid Metro Station', 'HUB 66',...","['3 Wardrobe', '5 Fan', '3 Geyser', '10 Light'...","['Power Back-up', 'Intercom Facility', 'Lift(s..."


In [359]:
# saving the file for further feature engineering

# saving file name 
file_name = interim_data_dir / 'flat-house-dataset.csv'

df.to_csv(file_name, index=False)