In [None]:
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Show every column when a DataFrame is displayed, then apply the seaborn look used by all plots.
pd.options.display.max_columns = None
sns.set_style("darkgrid")
sns.set_palette("husl")

In [None]:
# Read the cleaned listings file, then discard exact duplicate rows.
df = pd.read_csv("../Dataset/gurgaon_properties_cleaned_v2.csv")
df = df.drop_duplicates()

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

## property_type vs price

In [None]:
# Median price for each property type.
sns.barplot(data=df, x='property_type', y='price', estimator=np.median)

In [None]:
# Price distribution for each property type.
sns.boxplot(data=df, x='property_type', y='price')

Observation
* Houses are more costly than flats: the average house price is about 4 cr, versus about 1.5 cr for flats.

## property_type vs area

In [None]:
# Median built-up area for each property type.
sns.barplot(data=df, x='property_type', y='built_up_area', estimator=np.median)

In [None]:
# Built-up area distribution for each property type (before outlier removal).
sns.boxplot(data=df, x='property_type', y='built_up_area')

In [None]:
# Inspect the rows whose built-up area exceeds 700,000.
df.loc[df['built_up_area'].gt(700000)]

In [None]:
# Drop the single extreme listing whose built-up area equals 737147.
df = df.loc[df['built_up_area'].ne(737147)]

In [None]:
# Built-up area distribution for each property type, re-drawn after the outlier was dropped.
sns.boxplot(data=df, x='property_type', y='built_up_area')

Observation
* Houses have a larger built-up area than flats.

## property_type vs price_per_sqft

In [None]:
# Median price per square foot for each property type.
sns.barplot(data=df, x='property_type', y='price_per_sqft', estimator=np.median)

In [None]:
# Price-per-square-foot distribution for each property type.
sns.boxplot(data=df, x='property_type', y='price_per_sqft')

In [None]:
# Inspect the rows whose price per square foot exceeds 100,000.
df.loc[df['price_per_sqft'].gt(100000)]

Observation
* The price per sq. ft. is too high for these rows; there is an error in the conversion.

## property_type vs bedroom

In [None]:
# Count of listings for every (property type, bedroom count) pair.
sns.heatmap(pd.crosstab(index=df['property_type'], columns=df['bedRoom']))

In [None]:
# Floor number for each property type (seaborn's default bar estimator).
sns.barplot(data=df, x='property_type', y='floorNum')

In [None]:
# Floor-number distribution for each property type.
sns.boxplot(data=df, x='property_type', y='floorNum')

In [None]:
# Inspect listings typed as 'house' that sit above the 10th floor.
df[df['property_type'].eq('house') & df['floorNum'].gt(10)]

In [None]:
# Mean price for every (property type, bedroom count) pair, annotated on a wide figure.
plt.figure(figsize=(15,4))
sns.heatmap(
    df.pivot_table(index='property_type', columns='bedRoom', values='price', aggfunc='mean'),
    annot=True,
)

Observation
* Some listings typed as houses (villas) appear to be units inside apartment buildings.

## property_type vs agepossession

In [None]:
# Count of listings for every (property type, age/possession) pair.
sns.heatmap(pd.crosstab(index=df['property_type'], columns=df['agePossession']))

In [None]:
# Mean price for every (property type, age/possession) pair.
sns.heatmap(
    df.pivot_table(index='property_type', columns='agePossession', values='price', aggfunc='mean'),
    annot=True,
)

## property_type vs furnishing_type

In [None]:
# Count of listings for every (property type, furnishing type) pair.
sns.heatmap(pd.crosstab(index=df['property_type'], columns=df['furnishing_type']))

In [None]:
# Mean price for every (property type, furnishing type) pair.
sns.heatmap(
    df.pivot_table(index='property_type', columns='furnishing_type', values='price', aggfunc='mean'),
    annot=True,
)

## property_type vs luxury_score

In [None]:
# Luxury score for each property type (seaborn's default bar estimator).
sns.barplot(data=df, x='property_type', y='luxury_score')

In [None]:
# Luxury-score distribution for each property type.
sns.boxplot(data=df, x='property_type', y='luxury_score')

In [None]:
# Sector analysis: count of listings per property type in each sector.
# crosstab aligns its inputs on the index and orders the resulting columns itself,
# so sorting the sector Series by index beforehand has no effect and is omitted.
plt.figure(figsize=(15,6))
sns.heatmap(pd.crosstab(df['property_type'], df['sector']))

In [None]:
# Sector analysis: average price per sector.
# Group by 'sector' and calculate the average price
avg_price_per_sector = df.groupby('sector')['price'].mean().reset_index()

# Function to extract sector numbers
def extract_sector_number(sector_name):
    """Return the first run of digits in ``sector_name`` as an int.

    Intended as a sort key so sector labels can be ordered numerically
    instead of lexicographically.

    Parameters
    ----------
    sector_name : str
        Sector label to parse.

    Returns
    -------
    int or float
        The first integer found in the label. ``float('inf')`` is returned
        when the label contains no digits or is not a string (for example a
        missing value), so such entries sort after every numbered sector.
    """
    # Missing values show up as NaN/None; re.search would raise TypeError on them.
    if not isinstance(sector_name, str):
        return float('inf')
    match = re.search(r'\d+', sector_name)
    return int(match.group()) if match else float('inf')

# Attach a numeric sort key derived from each sector label.
avg_price_per_sector['sector_number'] = avg_price_per_sector['sector'].map(extract_sector_number)

# Order the sectors by that key.
avg_price_per_sector_sorted_by_sector = avg_price_per_sector.sort_values('sector_number')

# Tall single-column heatmap of mean price, one row per sector.
plt.figure(figsize=(5, 25))
sns.heatmap(
    avg_price_per_sector_sorted_by_sector.set_index('sector')[['price']],
    annot=True,
    fmt=".2f",
    linewidths=.5,
).set(
    title='Average Price per Sector (Sorted by Sector Number)',
    xlabel='Average Price',
    ylabel='Sector',
)
plt.show()

In [None]:
# Mean price per square foot for each sector.
avg_price_per_sqft_sector = df.groupby('sector')['price_per_sqft'].mean().reset_index()

# Numeric sort key; extract_sector_number is defined in the earlier sector-analysis cell.
avg_price_per_sqft_sector['sector_number'] = avg_price_per_sqft_sector['sector'].apply(extract_sector_number)

# Sort by sector number
avg_price_per_sqft_sector_sorted_by_sector = avg_price_per_sqft_sector.sort_values(by='sector_number')

# Plot the heatmap
plt.figure(figsize=(5, 25))
sns.heatmap(avg_price_per_sqft_sector_sorted_by_sector.set_index('sector')[['price_per_sqft']], annot=True, fmt=".2f", linewidths=.5)
# The title names the plotted quantity so the figure stands on its own.
plt.title('Average Price per Sqft per Sector (Sorted by Sector Number)')
plt.xlabel('Average Price per sqft')
plt.ylabel('Sector')
plt.show()

In [None]:
# Mean luxury score for each sector.
luxury_score = df.groupby('sector')['luxury_score'].mean().reset_index()

# Numeric sort key; extract_sector_number is defined in the earlier sector-analysis cell.
luxury_score['sector_number'] = luxury_score['sector'].apply(extract_sector_number)

# Sort by sector number
luxury_score_sector = luxury_score.sort_values(by='sector_number')

# Plot the heatmap
plt.figure(figsize=(5, 25))
sns.heatmap(luxury_score_sector.set_index('sector')[['luxury_score']], annot=True, fmt=".2f", linewidths=.5)
# Labels describe the luxury score actually plotted here, not price per sqft.
plt.title('Average Luxury Score per Sector (Sorted by Sector Number)')
plt.xlabel('Average Luxury Score')
plt.ylabel('Sector')
plt.show()

## price

In [None]:
# Price against area, coloured by bedroom count, for listings with area below 10,000.
# The same filtered frame supplies x, y and hue, so all three vectors cover the same
# rows instead of relying on seaborn to align a filtered x with unfiltered y/hue by index.
plt.figure(figsize=(12,8))
sns.scatterplot(data=df[df['area'] < 10000], x='area', y='price', hue='bedRoom')

In [None]:
# Median price for each bedroom count.
sns.barplot(data=df, x='bedRoom', y='price', estimator=np.median)

In [None]:
# Median price for each age/possession category; tick labels are turned vertical.
sns.barplot(data=df, x='agePossession', y='price', estimator=np.median)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Median price for each furnishing type.
sns.barplot(data=df, x='furnishing_type', y='price', estimator=np.median)

In [None]:
# Keep only the numeric columns for the correlation matrix.
numeric_df = df.select_dtypes(include='number')

# Annotated pairwise-correlation heatmap of those columns.
plt.figure(figsize=(8, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.show()


In [None]:
# Correlation of every numeric column with price, strongest positive first.
numeric_df = df.select_dtypes(include='number')
(
    numeric_df
    .corr()
    .loc[:, 'price']
    .sort_values(ascending=False)
)