In [5]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import skew, kurtosis

# Load the dataset
# The CSV location defaults to the original download path but can be
# overridden with the HOUSE_PRICE_CSV environment variable, so the script is
# not tied to one machine's folder layout.
file_path = os.environ.get(
    "HOUSE_PRICE_CSV",
    "C:\\Users\\aryas\\Downloads\\house_price.csv",
)
if not os.path.isfile(file_path):
    # Fail early with an actionable message instead of a bare read_csv error.
    raise FileNotFoundError(
        f"Dataset not found at {file_path!r}. Set the HOUSE_PRICE_CSV "
        "environment variable to the location of house_price.csv."
    )
df = pd.read_csv(file_path)

# Basic EDA
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print(df.head())

# Detecting Outliers using Different Methods
# Every method below works on the same column. The pandas reductions used
# (mean, std, quantile) skip NaN values, so all bounds stay finite even when
# some rows have no price_per_sqft.
pps = df['price_per_sqft']

# 1. Mean & Standard Deviation Method: bounds at mean +/- 3 standard deviations
mean_pps = pps.mean()
std_pps = pps.std()
lower_bound_std = mean_pps - (3 * std_pps)
upper_bound_std = mean_pps + (3 * std_pps)

# 2. Percentile Method: bounds at the 1st and 99th percentiles
# Series.quantile is used instead of np.percentile because np.percentile
# returns nan for both bounds as soon as the column contains a single NaN,
# which would make the percentile-filtered frame empty. On NaN-free data both
# give the same (linearly interpolated) values.
lower_bound_perc, upper_bound_perc = pps.quantile([0.01, 0.99])

# 3. IQR Method: bounds at 1.5 * IQR beyond the first and third quartiles
Q1 = pps.quantile(0.25)
Q3 = pps.quantile(0.75)
IQR = Q3 - Q1
lower_bound_iqr = Q1 - 1.5 * IQR
upper_bound_iqr = Q3 + 1.5 * IQR

# 4. Z-Score Method: keep rows whose z-score lies strictly between -3 and 3
# The z_score column is added to df before any filtering, so every filtered
# frame below carries it as well.
df['z_score'] = (pps - mean_pps) / std_pps
df_no_outliers_z = df[(df['z_score'] > -3) & (df['z_score'] < 3)]

# Applying Outlier Removal
# between() is inclusive on both ends, i.e. lower <= value <= upper.
df_no_outliers_std = df[pps.between(lower_bound_std, upper_bound_std)]
df_no_outliers_perc = df[pps.between(lower_bound_perc, upper_bound_perc)]
df_no_outliers_iqr = df[pps.between(lower_bound_iqr, upper_bound_iqr)]

# Checking the number of rows retained after removing outliers
print(len(df_no_outliers_std), len(df_no_outliers_perc), len(df_no_outliers_iqr), len(df_no_outliers_z))

# Box Plot to Compare Outlier Removal Methods
# One panel per dataset, filled row by row on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

boxplot_panels = [
    ('Original Data', df),
    ('Mean & Standard Deviation Method', df_no_outliers_std),
    ('Percentile Method', df_no_outliers_perc),
    ('IQR Method', df_no_outliers_iqr),
]
# axes.flat walks the grid in row-major order: [0, 0], [0, 1], [1, 0], [1, 1].
for ax, (panel_title, frame) in zip(axes.flat, boxplot_panels):
    sns.boxplot(y=frame['price_per_sqft'], ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# Normality Check - Histogram Before Transformation
raw_values = df['price_per_sqft']
plt.figure(figsize=(10, 5))
sns.histplot(raw_values, bins=50, kde=True)
plt.title('Histogram of Price Per Sqft (Original)')
plt.show()

# Skewness and Kurtosis Before Transformation
original_skewness, original_kurtosis = skew(raw_values), kurtosis(raw_values)
for label, value in (
    ("Original Skewness:", original_skewness),
    ("Original Kurtosis:", original_kurtosis),
):
    print(label, value)

# Log Transformation
# log1p computes log(1 + x); the result is stored as a new column on df.
df['price_per_sqft_log'] = np.log1p(raw_values)
log_values = df['price_per_sqft_log']

# Histogram After Log Transformation
plt.figure(figsize=(10, 5))
sns.histplot(log_values, bins=50, kde=True)
plt.title('Histogram of Price Per Sqft (Log Transformed)')
plt.show()

# Skewness and Kurtosis After Transformation
transformed_skewness, transformed_kurtosis = skew(log_values), kurtosis(log_values)
for label, value in (
    ("Transformed Skewness:", transformed_skewness),
    ("Transformed Kurtosis:", transformed_kurtosis),
):
    print(label, value)

# Correlation Heatmap
# Pairwise correlations between the selected columns, drawn as an annotated
# grid with two-decimal labels.
heatmap_columns = ['total_sqft', 'bath', 'price', 'bhk', 'price_per_sqft']
correlation_matrix = df[heatmap_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Heatmap')
plt.show()

# Scatter Plots
# Both figures share total_sqft on the x-axis; only the y column and the
# labels differ, so they are described as data and drawn in one loop.
scatter_specs = [
    ('price', 'Scatter Plot: Price vs Total Sqft', 'Price'),
    ('price_per_sqft', 'Scatter Plot: Price Per Sqft vs Total Sqft', 'Price Per Sqft'),
]
for y_column, plot_title, y_label in scatter_specs:
    plt.figure(figsize=(8, 5))
    sns.scatterplot(x=df['total_sqft'], y=df[y_column])
    plt.title(plot_title)
    plt.xlabel('Total Sqft')
    plt.ylabel(y_label)
    plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\aryas\\Downloads\\house_price.csv'