# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the Kaggle training CSV. `train` keeps the frame exactly as loaded;
# `df` is an independent copy used for the analysis.
train_csv_path = (
    "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
)
train = pd.read_csv(train_csv_path)
df = train.copy()

# ---------------------------------------------------------
# EDA (Exploratory Data Analysis)
# ---------------------------------------------------------

# Section 1: histogram (with KDE) of SalePrice next to the same plot for
# log1p(SalePrice), followed by the skewness of each series.
print("### 1. Target Variable Analysis (SalePrice) ###")
sale_price = df["SalePrice"]
log_sale_price = np.log1p(sale_price)

# One figure, two side-by-side axes: raw values on the left, log scale right.
fig, (raw_ax, log_ax) = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(sale_price, kde=True, ax=raw_ax)
raw_ax.set_title("SalePrice Distribution")

sns.histplot(log_sale_price, kde=True, ax=log_ax)
log_ax.set_title("Log-transformed SalePrice Distribution")

fig.tight_layout()
fig.savefig("log_transformed_saleprice_distribution.png")

print(f"Original Skewness: {sale_price.skew():.2f}")
print(f"Log-transformed Skewness: {log_sale_price.skew():.2f}")

# Section 2: count nulls per column, keep only columns that have at least
# one, and order them from most to fewest missing values.
print("\n### 2. Missing Value Analysis ###")
missing = (
    df.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
if missing.empty:
    print("No missing values found.")
else:
    # One bar per affected feature, with the x tick labels rotated 90 degrees.
    plt.figure(figsize=(12, 6))
    sns.barplot(x=missing.index, y=missing.to_numpy())
    plt.title("Missing Values by Feature")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig("missing_values_by_feature.png")
    print("Top Missing Features:\n", missing.head(10))

# Section 3: correlation matrix of the numeric columns, then a heatmap
# restricted to the 15 columns most strongly correlated with SalePrice.
print("\n### 3. Correlation Analysis ###")
# Numeric features only for correlation
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()
# Rank by absolute correlation so that a strong negative relationship is
# selected too instead of sorting to the bottom. SalePrice's correlation with
# itself is 1.0, so SalePrice is one of the 15 selected columns.
top_corr_features = (
    corr_matrix["SalePrice"].abs().sort_values(ascending=False).head(15).index
)

plt.figure(figsize=(12, 10))
# Slice the matrix computed above rather than calling corr() again:
# DataFrame.corr excludes missing values pair by pair, so the sub-matrix holds
# the same values as a fresh corr() on just these columns. The heatmap still
# shows the signed coefficients.
sns.heatmap(
    corr_matrix.loc[top_corr_features, top_corr_features],
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
)
plt.title("Top 15 Correlated Features with SalePrice")
plt.tight_layout()
plt.savefig("top_15_correlated_features.png")

# Section 4: scatter plot of GrLivArea against SalePrice so that points far
# from the main cloud can be spotted by eye.
print("\n### 4. Outlier Detection (Top Features) ###")
# GrLivArea is usually a strong indicator with outliers
plt.figure(figsize=(10, 5))
sns.scatterplot(data=df, x="GrLivArea", y="SalePrice")
plt.title("GrLivArea vs SalePrice")
plt.tight_layout()
plt.savefig("grlivarea_vs_saleprice_scatter.png")


# NOTE: a commonly used rule would drop rows with GrLivArea > 4000 and
# SalePrice < 300000. No rows are removed here; this section only plots the
# data so the candidates can be identified first.

# Section 5: one SalePrice boxplot per grouping column, each saved to its
# own PNG.
print("\n### 5. Categorical Variable Analysis ###")
# Each spec: (column, figure size, rotate x tick labels?, title, output file).
# OverallQual is numeric but acts like a category, so it is plotted the same
# way as Neighborhood.
boxplot_specs = [
    (
        "OverallQual",
        (10, 6),
        False,
        "OverallQual vs SalePrice",
        "overallqual_vs_saleprice_boxplot.png",
    ),
    (
        "Neighborhood",
        (15, 6),
        True,
        "Neighborhood vs SalePrice",
        "neighborhood_vs_saleprice_boxplot.png",
    ),
]
for cat_col, fig_size, rotate_labels, plot_title, png_name in boxplot_specs:
    plt.figure(figsize=fig_size)
    sns.boxplot(data=df, x=cat_col, y="SalePrice")
    if rotate_labels:
        # Only the Neighborhood plot turns its tick labels vertical.
        plt.xticks(rotation=90)
    plt.title(plot_title)
    plt.tight_layout()
    plt.savefig(png_name)