In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide display settings: show every column of wide frames and
# use the "husl" colour palette for all seaborn plots.
pd.options.display.max_columns = None
sns.set_palette(palette="husl")

In [None]:
# Load the cleaned listings and discard exact duplicate rows.
# drop_duplicates() keeps the original row labels, which later cells
# rely on when they address rows by hard-coded index label.
df = pd.read_csv("../Dataset/gurgaon_properties_cleaned_v2.csv")
df = df.drop_duplicates()

In [None]:
# Peek at the first five rows to check the columns loaded as expected.
df.head(n=5)

## There are 2 ways to handle outliers (numerical columns)
1. Normally distributed columns: flag values outside mean ± 3*std.
2. Columns that are not normally distributed: use the box-plot (IQR) rule.

In [None]:
# (rows, columns) of the de-duplicated frame before any outlier treatment.
df.shape

## Price

In [None]:
# Outliers on the basis of the price column.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['price'], kde=True, stat='density')

In [None]:
# Horizontal box plot of price to visualise points beyond the whiskers.
sns.boxplot(data=df, x='price')

In [None]:
# Box-plot (IQR) rule for `price`: a row is an outlier when its price lies
# more than 1.5 * IQR below the first quartile or above the third quartile.
Q1, Q3 = (df['price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences beyond which a value is flagged
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows strictly outside the fences
outliers = df[df['price'].lt(lower_bound) | df['price'].gt(upper_bound)]

# How many rows were flagged, plus summary statistics of their prices
num_outliers = len(outliers)
outliers_price_stats = outliers['price'].describe()

num_outliers, outliers_price_stats

In [None]:
# The 20 most expensive flagged rows, for manual inspection.
outliers.sort_values(by='price', ascending=False).iloc[:20]

## On the basis of the price column we can say that there are some genuine outliers, but there are some data errors as well

## price_per_sqft

In [None]:
# Distribution of price per square foot.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['price_per_sqft'], kde=True, stat='density')

In [None]:
# Box plot of price per square foot before any correction.
sns.boxplot(data=df, x='price_per_sqft')

In [None]:
# Box-plot (IQR) rule for `price_per_sqft`: flag rows lying more than
# 1.5 * IQR outside the inter-quartile range.
Q1, Q3 = (df['price_per_sqft'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Fences beyond which a value is flagged
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows strictly outside the fences
outliers_sqft = df[df['price_per_sqft'].lt(lower_bound) | df['price_per_sqft'].gt(upper_bound)]

# How many rows were flagged, plus summary statistics of their price_per_sqft
num_outliers = len(outliers_sqft)
outliers_sqft_stats = outliers_sqft['price_per_sqft'].describe()

num_outliers, outliers_sqft_stats

In [None]:
# First five flagged rows, to see what the extreme price_per_sqft values look like.
outliers_sqft.head(n=5)

## There is a unit error: for these rows an area < 1000 is not in sq. ft., it is in sq. yards (1 sq. yard = 9 sq. ft.)

In [None]:
# Among the flagged rows, areas below 1000 are treated as square yards and
# converted to square feet (x 9); larger areas are left unchanged.
# assign() builds a new frame instead of writing into a filtered slice of df
# (which triggers SettingWithCopyWarning), and mask() is the vectorised
# equivalent of the per-row lambda.
# NOTE(review): re-running this cell converts again any value that is still < 1000.
outliers_sqft = outliers_sqft.assign(
    area=outliers_sqft['area'].mask(outliers_sqft['area'] < 1000, outliers_sqft['area'] * 9)
)

In [None]:
# Recompute price_per_sqft for the flagged rows from price and area.
# The 10000000 factor presumably converts price from crores to rupees — TODO confirm.
# assign() returns a new frame, avoiding chained assignment on a slice of df.
outliers_sqft = outliers_sqft.assign(
    price_per_sqft=((outliers_sqft['price'] * 10000000) / outliers_sqft['area']).round()
)

In [None]:
# Summary statistics of price_per_sqft for the flagged rows.
outliers_sqft.loc[:, 'price_per_sqft'].describe()

In [None]:
# Write the flagged rows back into df in place; update() aligns on index
# labels and columns and only copies non-NA values from the other frame.
df.update(other=outliers_sqft)

In [None]:
# Distribution of price per square foot after writing the flagged rows back.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['price_per_sqft'], kde=True, stat='density')

In [None]:
# Box plot of price per square foot after writing the flagged rows back.
sns.boxplot(data=df, x='price_per_sqft')

In [None]:
# Listings whose price per square foot is still above 50000.
df.loc[df['price_per_sqft'].gt(50000)]

## Dropping all these rows, since there are only a few of them

In [None]:
# Keep only listings priced at 50000 per square foot or less.
df = df.loc[df['price_per_sqft'].le(50000)]

In [None]:
# Box plot of price per square foot after removing the > 50000 rows.
sns.boxplot(data=df, x='price_per_sqft')

## area

In [None]:
# Distribution of area.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['area'], kde=True, stat='density')

In [None]:
# Box plot of area before any treatment.
sns.boxplot(data=df, x='area')

In [None]:
# Summary statistics of area before any treatment.
df.loc[:, 'area'].describe()

In [None]:
# Box-plot (IQR) rule for `area`: flag rows lying more than 1.5 * IQR
# outside the inter-quartile range.
Q1 = df['area'].quantile(0.25)
Q3 = df['area'].quantile(0.75)
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers_area = df[(df['area'] < lower_bound) | (df['area'] > upper_bound)]

# Displaying the number of outliers and some statistics.
# The stats must describe the area outliers found above; the original read
# them from `outliers_sqft` (the price_per_sqft outliers) by copy-paste mistake.
num_outliers = outliers_area.shape[0]
outliers_area_stats = outliers_area['area'].describe()

num_outliers, outliers_area_stats

In [None]:
# Listings with an area above 100000.
df.loc[df['area'].gt(100000)]

In [None]:
# Drop the rows inspected above (area > 100000). Using <= keeps a row whose
# area is exactly 100000: it was not among the inspected rows, and the
# original strict `<` removed it silently.
df = df[df['area'] <= 100000]

In [None]:
# Distribution of area after removing the extreme (> 100000) rows.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['area'], kde=True, stat='density')

In [None]:
# Listings with an area above 10000, largest first, for manual review.
df.loc[df['area'].gt(10000)].sort_values(by='area', ascending=False)

In [None]:
# Remove rows, addressed by index label, that the manual review above judged unusable.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the cell
# re-runnable: labels that are already gone are skipped rather than raising KeyError.
df = df.drop(index=[818, 1796, 1123, 2, 2356, 115, 3649, 2503, 1471], errors='ignore')

In [None]:
# Hand-corrected areas, keyed by row index label.
# Label 48 is entered as 115 * 9, matching the sq. yard -> sq. ft. conversion used earlier.
area_corrections = {
    48: 115 * 9,
    300: 7250,
    2666: 5800,
    1358: 2660,
    3195: 2850,
    2131: 1812,
    3088: 2160,
    3444: 1175,
}

# Only touch labels that still exist: assigning through .loc to a missing
# label would silently append a new, mostly-NaN row to df.
for row_label, corrected_area in area_corrections.items():
    if row_label in df.index:
        df.loc[row_label, 'area'] = corrected_area

In [None]:
# Distribution of area after the manual drops and corrections.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['area'], kde=True, stat='density')

In [None]:
# Box plot of area after the manual drops and corrections.
sns.boxplot(data=df, x='area')

In [None]:
# Summary statistics of area after the manual drops and corrections.
df.loc[:, 'area'].describe()

## bedroom

In [None]:
# Distribution of bedroom counts.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['bedRoom'], kde=True, stat='density')

In [None]:
# Box plot of bedroom counts before filtering.
sns.boxplot(data=df, x='bedRoom')

In [None]:
# Keep only listings with at most 10 bedrooms.
df = df.loc[df['bedRoom'].le(10)]

In [None]:
# (rows, columns) remaining after the bedRoom <= 10 filter.
df.shape

In [None]:
# Distribution of bedroom counts after the <= 10 filter.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['bedRoom'], kde=True, stat='density')

In [None]:
# Box plot of bedroom counts after the <= 10 filter.
sns.boxplot(data=df, x='bedRoom')

In [None]:
# Summary statistics of bedroom counts after filtering.
df.loc[:, 'bedRoom'].describe()

## Bathroom

In [None]:
# Distribution of bathroom counts.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['bathroom'], kde=True, stat='density')

In [None]:
# Box plot of bathroom counts.
sns.boxplot(data=df, x='bathroom')

In [None]:
# Listings with more than 10 bathrooms, highest count first.
df.loc[df['bathroom'].gt(10)].sort_values(by='bathroom', ascending=False)

## super built up area


In [None]:
# Distribution of super built-up area.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['super_built_up_area'], kde=True, stat='density')

In [None]:
# Box plot of super built-up area.
sns.boxplot(data=df, x='super_built_up_area')

In [None]:
# Summary statistics of super built-up area.
df.loc[:, 'super_built_up_area'].describe()

In [None]:
# Listings whose super built-up area exceeds 6000.
df.loc[df['super_built_up_area'].gt(6000)]

## built up area


In [None]:
# Distribution of built-up area.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['built_up_area'], kde=True, stat='density')

In [None]:
# Box plot of built-up area.
sns.boxplot(data=df, x='built_up_area')

In [None]:
# Listings whose built-up area exceeds 10000.
df.loc[df['built_up_area'].gt(10000)]

## carpet area

In [None]:
# Distribution of carpet area.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['carpet_area'], kde=True, stat='density')

In [None]:
# Box plot of carpet area.
sns.boxplot(data=df, x='carpet_area')

In [None]:
# Listings whose carpet area exceeds 10000.
df.loc[df['carpet_area'].gt(10000)]

In [None]:
# Distribution of the luxury score.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['luxury_score'], kde=True, stat='density')

In [None]:
# Box plot of the luxury score.
sns.boxplot(data=df, x='luxury_score')

## modify price_per_sqft with updated area value

In [None]:
# Rebuild price_per_sqft for every row from price and the corrected area,
# rounded to a whole number.
# The 10000000 factor presumably converts price from crores to rupees — TODO confirm.
df['price_per_sqft'] = df['price'].mul(10000000).div(df['area']).round()

In [None]:
# Distribution of the recomputed price per square foot.
# sns.distplot is deprecated in seaborn and scheduled for removal;
# histplot with a KDE overlay on a density scale is its documented replacement.
sns.histplot(df['price_per_sqft'], kde=True, stat='density')

In [None]:
# Box plot of the recomputed price per square foot.
sns.boxplot(data=df, x='price_per_sqft')

In [None]:
# Listings whose recomputed price per square foot exceeds 42000.
df.loc[df['price_per_sqft'].gt(42000)]

In [None]:
# Among listings priced at 20000 per sq. ft. or less, take the 5th percentile
# of area per bedroom (the 2nd percentile, quantile(0.02), was tried as well).
x = df.loc[df['price_per_sqft'].le(20000)]
x['area'].div(x['bedRoom']).quantile(0.05)

# Strange behaviour: `area` (values are in lakhs but should be in thousands) and `price_per_sqft` (same issue) contain wrong data, and `carpet area` is in an unusually high range

In [None]:
# Five example listings with less than 250 units of area per bedroom.
# A fixed random_state makes the sample identical on every run,
# so any notes written about these rows stay valid.
df[df["area"]/df["bedRoom"]<250].sample(5, random_state=42)

In [None]:
# Scatter plot with a fitted regression line: bedroom count against area.
sns.lmplot(x="area", y="bedRoom", data=df)

In [None]:
# Helper column: area available per bedroom.
df['area_room_ratio'] = df["area"].div(df["bedRoom"])

In [None]:
# How the bedroom counts are distributed among listings with ratio < 250.
df.loc[df['area_room_ratio'].lt(250), 'bedRoom'].value_counts()

In [None]:
# (rows, columns) of the listings with less than 100 units of area per bedroom.
df.loc[df['area_room_ratio'].lt(100)].shape

In [None]:
# Drop the rows counted above (ratio < 100). Using >= keeps a row whose ratio
# is exactly 100: it was not among the counted rows, and the original strict
# `>` removed it silently.
df = df[df['area_room_ratio'] >= 100]

In [None]:
# Suspicious rows: more than 3 bedrooms but under 250 units of area per bedroom.
outlier_df = df.loc[df['area_room_ratio'].lt(250) & df['bedRoom'].gt(3)]

In [None]:
# Replace the bedroom count of the suspicious rows with bedrooms per floor
# (presumably the listed count covers every floor of the building — TODO confirm).
# A floorNum of 0 is mapped to NaN first: dividing by zero would write inf
# into bedRoom, whereas a NaN result is skipped by the following
# df.update(outlier_df), so such rows keep their original bedroom count.
# assign() returns a new frame, avoiding chained assignment on a slice of df.
outlier_df = outlier_df.assign(
    bedRoom=(outlier_df['bedRoom'] / outlier_df['floorNum'].replace(0, np.nan)).round()
)

In [None]:
# Write the adjusted bedroom counts back into df in place; update() aligns on
# index labels and columns and only copies non-NA values from the other frame.
df.update(other=outlier_df)

In [None]:
# Refresh the area-per-bedroom helper column now that bedRoom has changed.
df['area_room_ratio'] = df["area"].div(df["bedRoom"])

In [None]:
# First rows that still look wrong: more than 4 bedrooms with ratio < 250.
df.loc[df['area_room_ratio'].lt(250) & df['bedRoom'].gt(4)].head(n=5)

In [None]:
# (rows, columns) of the rows that still look wrong.
df.loc[df['area_room_ratio'].lt(250) & df['bedRoom'].gt(4)].shape

In [None]:
# Discard the remaining suspicious rows (ratio < 250 together with more than 4 bedrooms).
df = df.loc[~(df['area_room_ratio'].lt(250) & df['bedRoom'].gt(4))]

In [None]:
# (rows, columns) of the frame after all outlier treatment steps.
df.shape

In [None]:
# Final look at the first five treated rows before saving.
df.head(n=5)

In [None]:
# Persist the outlier-treated frame for later stages.
# NOTE(review): the row index and the helper column `area_room_ratio` are
# written to the file too — confirm that downstream readers expect them.
df.to_csv(path_or_buf="../Dataset/gurgaon_properties_outlier_treated.csv")