# House Price Prediction - Exploratory Data Analysis

**Goal:** Understand the dataset, identify patterns, and prepare for feature engineering

**Dataset:** Kaggle House Prices (Ames, Iowa)
- Training samples: 1,460
- Columns: 81 (79 predictor features, plus `Id` and the target)
- Target: SalePrice

## 1. Setup and Data Loading

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Suppress all warnings for the remainder of the session
warnings.filterwarnings('ignore')

# Plot defaults: seaborn "whitegrid" theme and a 12x6 default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print('Libraries imported successfully!')

In [None]:
# Read the train and test splits from the raw-data directory
raw_paths = {
    'train': '../data/raw/train.csv',
    'test': '../data/raw/test.csv',
}
train_df = pd.read_csv(raw_paths['train'])
test_df = pd.read_csv(raw_paths['test'])

# Report the size of each split and the combined row count
n_train, n_test = len(train_df), len(test_df)
print(f'Training data shape: {train_df.shape}')
print(f'Test data shape: {test_df.shape}')
print(f'\nTotal samples: {n_train + n_test}')

## 2. Initial Data Exploration

In [None]:
# Preview the first five training rows; the trailing expression is what the
# notebook renders as the cell output
print('First 5 rows of training data:')
train_df.head(n=5)

In [None]:
# Print a concise schema summary of the training frame.
# DataFrame.info() writes its report to stdout itself, so it is called
# directly rather than wrapped in print().
print('Dataset Information:')
train_df.info()

In [None]:
# Show summary statistics for the training frame.
# describe() is left as the last expression of the cell so the notebook
# renders the resulting table.
print('Statistical Summary of Numerical Features:')
train_df.describe()

In [None]:
# Partition the columns by dtype: 64-bit numeric columns vs. object columns.
# Note: the numeric list includes every int64/float64 column, so the target
# (SalePrice) is part of it -- the correlation cell relies on that.
numeric_dtypes = ['int64', 'float64']
numerical_features = list(train_df.select_dtypes(include=numeric_dtypes).columns)
categorical_features = list(train_df.select_dtypes(include=['object']).columns)

n_numerical = len(numerical_features)
n_categorical = len(categorical_features)
print(f'Numerical features: {n_numerical}')
print(f'Categorical features: {n_categorical}')

# Only the first ten names of each group are shown to keep the output short
print(f'\nNumerical: {numerical_features[:10]} ...')
print(f'\nCategorical: {categorical_features[:10]} ...')

## 3. Target Variable Analysis (SalePrice)

In [None]:
# Descriptive statistics plus shape measures (skewness, kurtosis) of the target
sale_price = train_df['SalePrice']

print('SalePrice Statistics:')
print(sale_price.describe())
print(f'\nSkewness: {sale_price.skew():.2f}')
print(f'Kurtosis: {sale_price.kurtosis():.2f}')

In [None]:
# Plot the SalePrice distribution: histogram (left panel) and box plot (right panel)
sale_price = train_df['SalePrice']
mean_price = sale_price.mean()
median_price = sale_price.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left: histogram with dashed reference lines at the mean and the median
hist_ax.hist(sale_price, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Sale Price ($)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.set_title('Distribution of House Prices', fontsize=14, fontweight='bold')
hist_ax.axvline(mean_price, color='red', linestyle='--', label=f'Mean: ${mean_price:,.0f}')
hist_ax.axvline(median_price, color='green', linestyle='--', label=f'Median: ${median_price:,.0f}')
hist_ax.legend()

# Right: vertical box plot of the same values
box_ax.boxplot(sale_price, vert=True)
box_ax.set_ylabel('Sale Price ($)', fontsize=12)
box_ax.set_title('Box Plot of House Prices', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f'Price range: ${sale_price.min():,.0f} - ${sale_price.max():,.0f}')

In [None]:
# Compare SalePrice against a normal distribution with Q-Q plots,
# once on the raw values and once after a log1p transform
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

qq_panels = [
    (train_df['SalePrice'], 'Q-Q Plot: SalePrice (Original)'),
    (np.log1p(train_df['SalePrice']), 'Q-Q Plot: SalePrice (Log-Transformed)'),
]
for ax, (values, panel_title) in zip(axes, qq_panels):
    stats.probplot(values, dist="norm", plot=ax)
    # Set the title after probplot so it replaces the one probplot draws
    ax.set_title(panel_title, fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print('Note: Log transformation may help normalize the target variable for better model performance.')

## 4. Missing Values Analysis

In [None]:
# Count nulls per column and express them as a percentage of all training rows
n_rows = len(train_df)
missing = train_df.isna().sum()
missing_pct = (missing / n_rows) * 100

missing_df = pd.DataFrame(
    {
        'Feature': missing.index,
        'Missing_Count': missing.to_numpy(),
        'Missing_Percentage': missing_pct.to_numpy(),
    }
)

# Keep only the columns that actually have nulls, worst offenders first
has_missing = missing_df['Missing_Count'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='Missing_Percentage', ascending=False)

print(f'Features with missing values: {len(missing_df)}/{len(train_df.columns)}')
print('\nTop 10 features with most missing values:')
# Trailing expression: rendered by the notebook as the cell output
missing_df.head(10)

In [None]:
# Horizontal bar chart of the (up to) 15 columns with the highest share of nulls
if len(missing_df) == 0:
    print('No missing values found!')
else:
    top_missing = missing_df.head(15)

    plt.figure(figsize=(12, 6))
    plt.barh(top_missing['Feature'], top_missing['Missing_Percentage'], color='coral')
    plt.xlabel('Missing Percentage (%)', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.title('Top 15 Features with Missing Values', fontsize=14, fontweight='bold')
    # barh draws the first row at the bottom; flip so the worst column is on top
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation of every numerical column with SalePrice, strongest positive first
corr_matrix = train_df[numerical_features].corr()
correlations = corr_matrix['SalePrice'].sort_values(ascending=False)

# Eleven rows are printed because SalePrice's correlation with itself is among them
print('Top 10 features most correlated with SalePrice:')
print(correlations.head(11))  # Include SalePrice itself

# With a descending sort, the tail holds the lowest (including negative) values
print('\nBottom 10 features (least/negatively correlated):')
print(correlations.tail(10))

In [None]:
# Heatmap of pairwise correlations among the first 11 entries of `correlations`
# (intended as SalePrice plus its ten strongest positive correlates)
top_features = correlations.index[:11].tolist()  # Top 10 + SalePrice
top_corr_matrix = train_df[top_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    top_corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
plt.title('Correlation Heatmap: Top Features vs SalePrice', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of entries 2-11 of `correlations`; the first entry is
# skipped on the assumption that it is SalePrice's correlation with itself
top_corr = correlations.iloc[1:11]  # Exclude SalePrice itself

plt.figure(figsize=(10, 6))
top_corr.plot.barh(color='steelblue')
plt.xlabel('Correlation with SalePrice', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.title('Top 10 Features Correlated with House Price', fontsize=14, fontweight='bold')
# Flip the axis so the strongest correlation is drawn at the top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 6. Key Feature Analysis

In [None]:
# Scatter plots of the numerical features most correlated with SalePrice.
# The first entry of `correlations` is skipped on the assumption that it is
# SalePrice itself, leaving at most five features for the 2x3 grid.
top_num_features = correlations.head(6)[1:].index.tolist()  # Top 5 excluding SalePrice

fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.flatten()

# One panel per feature; the title carries the feature's correlation with the target
for ax, feature in zip(axes, top_num_features):
    ax.scatter(train_df[feature], train_df['SalePrice'], alpha=0.5)
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('SalePrice', fontsize=10)
    ax.set_title(f'{feature} vs SalePrice (r={correlations[feature]:.2f})',
                 fontsize=11, fontweight='bold')

# Remove every panel that received no feature. Deleting only the last axis
# would leave blank frames behind whenever fewer than five features exist.
for unused_ax in axes[len(top_num_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

In [None]:
# Average SalePrice per category for three hand-picked grouping columns.
# NOTE(review): OverallQual is presumably a numeric rating rather than a string
# category -- the astype(str) below makes its labels plot as discrete bars.
top_categorical = ['Neighborhood', 'OverallQual', 'ExterQual']  # Common important ones

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, cat_feature in enumerate(top_categorical):
    if cat_feature not in train_df.columns:
        continue  # column absent: its panel is left empty

    panel = axes[idx]
    # Mean price per category, most expensive category first
    mean_prices = (
        train_df.groupby(cat_feature)['SalePrice']
        .mean()
        .sort_values(ascending=False)
    )

    panel.barh(mean_prices.index.astype(str), mean_prices.values, color='coral')
    panel.set_xlabel('Average Sale Price ($)', fontsize=10)
    panel.set_ylabel(cat_feature, fontsize=10)
    panel.set_title(f'Average Price by {cat_feature}', fontsize=12, fontweight='bold')
    # Flip the axis so the most expensive category is drawn at the top
    panel.invert_yaxis()

plt.tight_layout()
plt.show()

## 7. Outlier Detection

In [None]:
# Detect outliers using IQR method for top features
def detect_outliers_iqr(df, feature):
    """Count the rows of ``df`` whose ``feature`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of the column. Missing values fail both comparisons and are
    therefore never counted.

    Args:
        df: DataFrame holding the column to inspect.
        feature: Name of a numeric column in ``df``.

    Returns:
        The number of outlying rows, as a plain ``int``.
    """
    values = df[feature]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return int((too_low | too_high).sum())

# IQR outlier count for each of the top numerical features
outlier_counts = {
    feature: detect_outliers_iqr(train_df, feature)
    for feature in top_num_features
}

# Report each count together with its share of all training rows
n_rows = len(train_df)
print('Outlier counts for top features:')
for feature, count in outlier_counts.items():
    print(f'{feature}: {count} outliers ({count/n_rows*100:.1f}%)')

In [None]:
# Scatter SalePrice against two area columns to eyeball potential outliers
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column, x-axis label, panel title, extra scatter kwargs) for each panel
outlier_panels = [
    ('GrLivArea', 'Above Grade Living Area (sq ft)',
     'Living Area vs Price (Check for outliers)', {}),
    ('TotalBsmtSF', 'Total Basement Area (sq ft)',
     'Basement Area vs Price (Check for outliers)', {'color': 'orange'}),
]
for ax, (column, x_label, panel_title, extra_style) in zip(axes, outlier_panels):
    ax.scatter(train_df[column], train_df['SalePrice'], alpha=0.6, **extra_style)
    ax.set_xlabel(x_label, fontsize=11)
    ax.set_ylabel('Sale Price ($)', fontsize=11)
    ax.set_title(panel_title, fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

print('\nNote: Large living areas with low prices might be outliers to investigate.')

## 8. Key Insights and Next Steps

In [None]:
# Print a consolidated recap of the findings computed in the earlier cells
banner = '=' * 60
sale_price = train_df['SalePrice']
n_columns = len(train_df.columns)
n_missing_features = len(missing_df)

print(banner)
print('EDA SUMMARY AND KEY INSIGHTS')
print(banner)

print('\n1. DATASET:')
print(f'   - Training samples: {len(train_df)}')
print(f'   - Features: {n_columns - 1} (excluding target)')
print(f'   - Numerical features: {len(numerical_features)}')
print(f'   - Categorical features: {len(categorical_features)}')

print('\n2. TARGET VARIABLE (SalePrice):')
print(f'   - Mean: ${sale_price.mean():,.0f}')
print(f'   - Median: ${sale_price.median():,.0f}')
print(f'   - Range: ${sale_price.min():,.0f} - ${sale_price.max():,.0f}')
print(f'   - Skewness: {sale_price.skew():.2f} (right-skewed)')
print('   - Recommendation: Consider log transformation')

print('\n3. MISSING VALUES:')
print(f'   - Features with missing data: {n_missing_features}')
if n_missing_features > 0:
    # missing_df is sorted by descending percentage, so row 0 is the worst column
    worst = missing_df.iloc[0]
    print(f'   - Worst feature: {worst["Feature"]} ({worst["Missing_Percentage"]:.1f}% missing)')
print('   - Action: Need imputation strategy')

# Entries 2-6 of `correlations`; the first entry is skipped as SalePrice itself
print('\n4. TOP PREDICTIVE FEATURES:')
for i, (feature, corr) in enumerate(correlations.iloc[1:6].items(), start=1):
    print(f'   {i}. {feature}: {corr:.3f}')

print('\n5. NEXT STEPS (Feature Engineering):')
for step_line in (
    '   - Handle missing values (imputation)',
    '   - Log-transform SalePrice and skewed features',
    '   - Encode categorical variables',
    '   - Remove or cap outliers',
    '   - Create new features (e.g., TotalSF, Age)',
    '   - Feature scaling/normalization',
):
    print(step_line)
print(banner)

## 9. Save EDA Results

In [None]:
# Persist the EDA artefacts (correlation table and missing-values report) as CSV
import os

# to_csv does not create missing parent directories and raises OSError if the
# target folder is absent, so make sure it exists before writing anything.
os.makedirs('../data/processed', exist_ok=True)

# Save correlation data for reference
correlations.to_csv('../data/processed/feature_correlations.csv', header=['Correlation'])
print('Correlation data saved to: data/processed/feature_correlations.csv')

# Save missing values report (skipped when no column has missing values)
if len(missing_df) > 0:
    missing_df.to_csv('../data/processed/missing_values_report.csv', index=False)
    print('Missing values report saved to: data/processed/missing_values_report.csv')

print('\nEDA Complete! Ready for Phase 3: Feature Engineering & Model Training')