In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Remove the column cap so wide frames are displayed in full.
pd.options.display.max_columns = None

In [None]:
# Load the cleaned property listings; the path is relative to the current working directory.
df = pd.read_csv("../Dataset/gurgaon_properties_cleaned_v2.csv")

In [None]:
df.duplicated().sum()

In [None]:
# Remove fully duplicated rows. inplace=True mutates df itself and the
# surviving rows keep their original index labels (the index is not reset).
df.drop_duplicates(inplace=True)

In [None]:
df.head()

## Property_type

In [None]:
df["property_type"].value_counts()

In [None]:
df["property_type"].isnull().sum()

In [None]:
df["property_type"].value_counts().plot(kind="bar")

Observation
* About 75% of the listings are flats
* There are no missing values

## society

In [None]:
df['society'].isnull().sum()

In [None]:
df['society'].value_counts()

In [None]:
# Share of listings per society (top 75), excluding rows whose society is 'independent'.
df.loc[df['society'] != 'independent', 'society'].value_counts(normalize=True).head(75)

In [None]:
# Cumulative share of listings covered by the 75 most frequent societies
# (rows whose society is 'independent' are excluded).
df.loc[df['society'] != 'independent', 'society'].value_counts(normalize=True).cumsum().head(75)

In [None]:
# Bar chart of the 10 societies with the most listings (excluding 'independent').
df.loc[df['society'] != 'independent', 'society'].value_counts().head(10).plot.bar()

In [None]:
# Bucket societies by how many listings each one has.
society_counts = df['society'].value_counts()

# Listing counts are whole numbers, so the inclusive ranges below partition
# every possible count. "High" now includes exactly 100 listings; previously
# a society with exactly 100 listings fell between "High" (< 100) and
# "Very High" (> 100) and was counted in no bin.
frequency_bin = {
    "Very High": (society_counts > 100).sum(),
    "High": society_counts.between(50, 100).sum(),
    "Average": society_counts.between(10, 49).sum(),
    "Low": society_counts.between(2, 9).sum(),
    "Very Low": (society_counts == 1).sum()
}
frequency_bin

Observation
* There is only 1 missing value
* About 75 societies account for 50% of the listings; the remaining ~600 societies share the other 50%
* Most societies have only 1 listing
* There are 675 societies in total


## sector

In [None]:
df['sector'].value_counts().shape

In [None]:
df['sector'].isnull().sum()

In [None]:
df['sector'].value_counts().head(10).plot(kind="bar")

In [None]:
# Bucket sectors by their number of listings. Counts are whole numbers, so
# each inclusive range below matches the range written in its label.
sector_counts = df['sector'].value_counts()

sector_frequency_bins = {
    "Very High (>100)": sector_counts.gt(100).sum(),
    "High (50-100)": sector_counts.between(50, 100).sum(),
    "Average (10-49)": sector_counts.between(10, 49).sum(),
    "Low (2-9)": sector_counts.between(2, 9).sum(),
    "Very Low (1)": sector_counts.eq(1).sum(),
}
sector_frequency_bins

Observation
* There are a lot of sectors: 104 to be exact
* The counts look reasonable; the surprising part is that some sectors have only 1 listing
* The "Average" bin is well populated
* There are 0 missing values

In [None]:
df.head()

## price

In [None]:
df['price'].isnull().sum()

In [None]:
df['price'].describe()

In [None]:
sns.histplot(df['price'], kde=True, bins=50)

In [None]:
# Horizontal box plot of price; the grid call acts on the axes seaborn just drew on.
price_ax = sns.boxplot(x=df['price'], color='lightgreen')
price_ax.grid()

In [None]:
# Shape of the price distribution: skewness (asymmetry) and kurtosis (tail weight).
skewness, kurtosis = df['price'].skew(), df['price'].kurt()

print(skewness,kurtosis)

In [None]:
# Tail quantiles of price: the 1%, 5%, 95% and 99% cut-offs.
quantiles = df['price'].quantile(q=[0.01, 0.05, 0.95, 0.99])

quantiles

In [None]:
# Identify potential outliers using the 1.5 * IQR rule.
# quantile() returns the same quartiles as describe()['25%'] / ['75%']
# without computing the full summary statistics twice.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

print(IQR)

# Anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is treated as a potential outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(lower_bound, upper_bound)

In [None]:
# Rows whose price falls outside the IQR fences. Missing prices compare as
# False on both sides, so they are not selected.
is_price_outlier = df['price'].lt(lower_bound) | df['price'].gt(upper_bound)
outliers = df.loc[is_price_outlier]
outliers.shape

In [None]:
outliers['price'].describe()

Observation
* There are 17 null values
* The data is right-skewed and has a lot of outliers
* From the histogram, most properties are priced below 5 cr
* The median price is 1.52 cr
* 95% of properties are priced below 8.5 cr
* The minimum price is 70 lakh and the maximum is 31.5 cr
* According to the IQR rule, 425 values are outliers

In [None]:
# Price ranges with left-closed intervals: [0, 1), [1, 2), ... [20, 50).
bins = [0, 1, 2, 3, 5, 10, 20, 50]
# Labels are derived from consecutive edges: "0-1", "1-2", ..., "20-50".
bin_labels = [f"{low}-{high}" for low, high in zip(bins[:-1], bins[1:])]

price_ranges = pd.cut(df['price'], bins=bins, labels=bin_labels, right=False)
price_ranges.value_counts().sort_index().plot(kind='bar')

In [None]:
# Side-by-side histograms: raw price vs. log1p-transformed price.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: price on its original scale
sns.histplot(df['price'], kde=True, bins=50, color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of Prices (Original)')
ax_raw.set_xlabel('Price (in Crores)')
ax_raw.set_ylabel('Frequency')

# Right panel: log1p(price), i.e. log(1 + price)
sns.histplot(np.log1p(df['price']), kde=True, bins=50, color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of Prices (Log Transformed)')
ax_log.set_xlabel('Log(Price)')
ax_log.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:

# Side-by-side box plots: raw price vs. log1p-transformed price.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(15, 6))

# Box plot without log transformation
sns.boxplot(x=df['price'], color='skyblue', ax=ax_raw)
ax_raw.set_title('Distribution of Prices (Original)')
ax_raw.set_xlabel('Price (in Crores)')
# A horizontal box plot has no frequency axis, so the y-label copied from
# the histogram cell ('Frequency') is dropped.
ax_raw.set_ylabel('')

# Box plot with log transformation (log1p = log(1 + price))
sns.boxplot(x=np.log1p(df['price']), color='lightgreen', ax=ax_log)
ax_log.set_title('Distribution of Prices (Log Transformed)')
ax_log.set_xlabel('Log(Price)')
ax_log.set_ylabel('')

fig.tight_layout()
plt.show()

Observation
* After binning, most properties are priced at 2-3 cr or below; beyond 5 cr the counts drop sharply
* A log transform brings the distribution closer to normal. A plain `log` gives negative values for prices below 1 cr (and is undefined at 0), so `log1p`, i.e. log(1 + price), is used to keep the transformed values non-negative
  

## price_per_sqft

In [None]:
df['price_per_sqft'].describe()

In [None]:
df['price_per_sqft'].isnull().sum()

In [None]:
sns.histplot(df['price_per_sqft'], kde=True, bins=50)

In [None]:
# Zoomed-in histogram: only values within 0-50,000 are binned.
fig, ax = plt.subplots()
ax.hist(df['price_per_sqft'], bins=50, range=(0, 50000), edgecolor='black')
plt.show()

In [None]:
sns.boxplot(x = df['price_per_sqft'], color = "lightgreen")

Observation
* There are 17 missing values
* Surprisingly, the minimum price per sq.ft is 4 rupees; the median is 9,020, which looks reasonable
* There are a lot of outliers
* Most price-per-sq.ft values lie between 0 and 20,000

## bedroom

In [None]:
df['bedRoom'].isnull().sum()

In [None]:
df['bedRoom'].value_counts().sort_index().plot(kind='bar')

In [None]:
# Pie chart of bedroom-count shares.
# Plotting only head(5) lets the pie re-scale those five slices to 100%,
# so the printed percentages overstate the real shares. The remaining
# categories are folded into one 'other' slice so the labels stay true
# shares of all listings.
bedroom_share = df['bedRoom'].value_counts(normalize=True)
bedroom_pie_data = bedroom_share.head(5)
bedroom_other_share = bedroom_share.iloc[5:].sum()
if bedroom_other_share > 0:
    bedroom_pie_data = pd.concat(
        [bedroom_pie_data, pd.Series({'other': bedroom_other_share}, name=bedroom_share.name)]
    )
bedroom_pie_data.plot(kind='pie',autopct='%0.2f%%')

Observation
* Most properties have 2 or 3 bedrooms, and there is one property with 16 bedrooms
* There are no missing values

## bathroom

In [None]:
df['bathroom'].isnull().sum()

In [None]:
df['bathroom'].value_counts()

In [None]:
df['bathroom'].value_counts().head(10).plot(kind="bar")

In [None]:
# Pie chart of bathroom-count shares.
# Plotting only head(5) lets the pie re-scale those five slices to 100%,
# so the printed percentages overstate the real shares. The remaining
# categories are folded into one 'other' slice so the labels stay true
# shares of all listings.
bathroom_share = df['bathroom'].value_counts(normalize=True)
bathroom_pie_data = bathroom_share.head(5)
bathroom_other_share = bathroom_share.iloc[5:].sum()
if bathroom_other_share > 0:
    bathroom_pie_data = pd.concat(
        [bathroom_pie_data, pd.Series({'other': bathroom_other_share}, name=bathroom_share.name)]
    )
bathroom_pie_data.plot(kind='pie',autopct='%0.2f%%')

Observation
* same as bedroom

## balcony

In [None]:
df['balcony'].isnull().sum()

In [None]:
df['balcony'].value_counts()

In [None]:
df['balcony'].value_counts().plot(kind='bar')

In [None]:
df['balcony'].value_counts(normalize=True).plot(kind='pie',autopct='%0.2f%%')

Observation
* there is no missing value
* there is 3+ balcony

In [None]:
df.head()

## floor number

In [None]:
df['floorNum'].isnull().sum()

In [None]:
df['floorNum'].describe()

In [None]:
df['floorNum'].value_counts().sort_index().plot(kind='bar')

In [None]:
sns.boxplot(x = df['floorNum'], color='lightgreen')

In [None]:
# Inspect the listings on floors above 40 (the extreme end of the box plot).
df.loc[df['floorNum'].gt(40)]

Observation
* About 50% of the properties are between the ground floor (0) and the 5th floor.
* The 2nd and 3rd floors are pretty common
* The box plot reveals that the majority of the properties are concentrated around the lower floors. The interquartile range (IQR) lies between approximately the 2nd and 10th floors.
* There are a few properties on higher floors (> 25)

## facing

In [None]:
df['facing'].isnull().sum()

In [None]:
# Mark missing 'facing' values with an explicit 'NA' category.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form acts on a temporary object,
# raises a FutureWarning in pandas 2.x and leaves df unchanged under
# Copy-on-Write.
df['facing'] = df['facing'].fillna('NA')

In [None]:
df['facing'].value_counts()

Observation
* There are a lot of missing values
* `facing` can be used to check whether properties facing a specific direction have higher prices

## agePossession

In [None]:
df['agePossession'].isnull().sum()

In [None]:
df['agePossession'].value_counts()

In [None]:
df['agePossession'].value_counts().plot(kind="bar")

Observation
* There are no null values.
* Most properties are newly built

## super built up area

In [None]:
df['super_built_up_area'].isnull().sum()

In [None]:
df['super_built_up_area'].describe()

In [None]:
sns.histplot(df['super_built_up_area'], bins=50, color='skyblue', kde=True)

In [None]:
sns.boxplot(x = df['super_built_up_area'], color='skyblue')

Observation
* Most properties have a super built-up area ranging between approximately 1,000 sq.ft and 2,500 sq.ft.
* There are a few properties with a significantly larger area, leading to a right-skewed distribution.
* The interquartile range (IQR) lies between roughly 1,480 sq.ft and 2,215 sq.ft, indicating that the middle 50% of the properties fall within this range.
* There are several data points beyond the upper "whisker" of the box plot, indicating potential outliers. These are properties with an unusually large super built-up area.

## built up area

In [None]:
df['built_up_area'].isnull().sum()

In [None]:
df['built_up_area'].describe()

In [None]:
sns.histplot(df['built_up_area'], bins=50, color='skyblue', kde=False)

In [None]:
sns.boxplot(x = df['built_up_area'], color='lightgreen')

## Carpet area

In [None]:
df['carpet_area'].isnull().sum()

In [None]:
df['carpet_area'].describe()

In [None]:
sns.histplot(df['carpet_area'], bins=50, color='skyblue', kde=False)

In [None]:
sns.boxplot(x = df['carpet_area'], color='lightgreen')

Observation
* Most properties have a built-up area ranging roughly between 500 sq.ft and 3,500 sq.ft.
* There are very few properties with a much larger built-up area, leading to a highly right-skewed distribution.
* The box plot confirms the presence of significant outliers on the higher side. The data's interquartile range (IQR) is relatively compact, but the "whiskers" of the box plot are stretched due to the outliers.
* The presence of extreme values, especially on the higher side, suggests that there may be outliers or data errors. This could also be due to some properties being exceptionally large, like a commercial complex or an entire building being listed.

In [None]:
df.head()

## additional room

In [None]:
# One pie chart per additional-room column, laid out on a 2 x 3 grid.
room_columns = ['study room', 'servant room', 'store room', 'pooja room', 'others']

fig = plt.figure(figsize=(20, 12))

# Subplot positions are 1-based, hence start=1.
for position, room in enumerate(room_columns, start=1):
    ax = fig.add_subplot(2, 3, position)
    df[room].value_counts().plot.pie(autopct='%1.1f%%', startangle=90, ax=ax)
    ax.set_title(f'Distribution of {room.title()}')
    ax.set_ylabel('')

fig.tight_layout()
plt.show()

## furnishing type

In [None]:
df['furnishing_type'].value_counts()

In [None]:
df['furnishing_type'].value_counts().plot(kind='pie',autopct='%0.2f%%')

In [None]:
df['furnishing_type'].value_counts().plot(kind='bar')

Observation
* Most properties are furnished

## luxury score

In [None]:
df['luxury_score'].isnull().sum()

In [None]:
df['luxury_score'].describe()

In [None]:
sns.histplot(df['luxury_score'], bins=50, color='skyblue', kde=True)

In [None]:
sns.boxplot(x = df['luxury_score'], color='lightgreen')

# Pandas profiling

In [None]:
# Create the ProfileReport object.
# The pandas_profiling package was renamed to ydata_profiling; prefer the new
# name and fall back to the old one so existing environments keep working.
try:
    from ydata_profiling import ProfileReport
except ImportError:
    from pandas_profiling import ProfileReport

profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)

# Generate the report as an HTML file in the current working directory
profile.to_file("output_report.html")