#### Import Libraries

In [None]:
# Import essential libraries for modeling
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Plot appearance for the whole notebook: start from matplotlib's 'default'
# style, switch seaborn to the viridis palette, then enlarge the default
# figure and font sizes in a single rcParams update.
plt.style.use('default')
sns.set_palette("viridis")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 11,
})

# Sanity check: confirm the imports ran and record the library versions in use.
print("All libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

#### Load and Examine Cleaned Dataset

In [None]:
# Load the cleaned dataset.
# NA parsing is switched off (keep_default_na=False with an empty na_values
# list), so no string - "NA", "None", or an empty field - is turned into NaN
# on load. This keeps literal category labels intact.
df = pd.read_csv('../data/train_cleaned.csv', keep_default_na=False, na_values=[])

print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(f"Missing values: {df.isnull().sum().sum()}")

# With NA parsing off, a blank field is loaded as an empty string, which
# isnull() does not count. Report those cells separately so that gaps in the
# data are not silently hidden behind a "0 missing values" line.
non_numeric_columns = df.select_dtypes(exclude=[np.number])
blank_cells = int(non_numeric_columns.eq('').sum().sum())
print(f"Empty-string cells: {blank_cells}")

# Display basic info
print("\nData Types:")
print(f"Numerical features: {len(df.select_dtypes(include=[np.number]).columns)}")
print(f"Categorical features: {len(non_numeric_columns.columns)}")

# Show first few rows (last expression in the cell, so the notebook renders it)
print("\nFirst 3 rows:")
df.head(3)

#### Correlation Analysis with Target Variable

In [None]:
# Select only numerical features for correlation analysis
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Found {len(numerical_features)} numerical features")

# Correlation of every numerical feature with the target (SalePrice), sorted
# from most positive to most negative. sort_values() places NaN entries
# (e.g. from a constant column) at the very end. The SalePrice entry itself is
# kept in `correlations`; the feature-only view below drops it by name.
correlations = df[numerical_features].corr()['SalePrice'].sort_values(ascending=False)

# Drop the target by name rather than assuming it is the first row after
# sorting (a feature with r == 1.0 could otherwise take its place).
feature_correlations = correlations.drop('SalePrice')

print("\nTOP 15 FEATURES MOST CORRELATED WITH SALEPRICE:")
print("=" * 55)
top_15 = feature_correlations.head(15)
for feature, corr in top_15.items():
    print(f"{feature:<25} {corr:>8.3f}")

print("\nTOP 10 NEGATIVE CORRELATIONS:")
print("=" * 55)
# Keep only genuinely negative values before taking the tail: a plain
# tail(10) would list NaN entries, or positive ones when fewer than ten
# features are negatively correlated, under this heading.
bottom_10 = feature_correlations[feature_correlations < 0].tail(10)
for feature, corr in bottom_10.items():
    print(f"{feature:<25} {corr:>8.3f}")

# Count features by correlation strength (target excluded; NaN never matches).
abs_correlations = feature_correlations.abs()
strong_count = int((abs_correlations > 0.5).sum())
moderate_count = int(((abs_correlations > 0.3) & (abs_correlations <= 0.5)).sum())
print(f"\nStrong correlations (|r| > 0.5): {strong_count}")
# The label states the inclusive upper bound that the condition actually uses.
print(f"Moderate correlations (0.3 < |r| <= 0.5): {moderate_count}")

TOP PERFORMERS:

    TotalSF (0.782) - the engineered total square footage feature ranks #2 overall

    TotalBath (0.613) - the bathroom count feature made the top 10

    ExteriorQualityAvg (0.590) - quality average is working great


KEY INSIGHTS:
    
    20 strong correlations (|r| > 0.5) - Excellent feature set

    Quality features dominate - OverallQual, ExterQual_Ordinal, KitchenQual_Ordinal

    Size matters most - TotalSF, GrLivArea, GarageArea all high

    Age is negative - older houses are worth less (HouseAge: -0.523)

#### Analyze the Performance of Your Engineered Features

In [None]:
# Identify and analyze engineered features
engineered_features = ['TotalSF', 'TotalBath', 'TotalPorchSF', 'HouseAge', 'YearsSinceRemod', 
                      'GarageAge', 'WasRemodeled', 'LivingAreaRatio', 'BasementRatio',
                      'ExteriorQualityAvg', 'BasementQualityAvg', 'HasPool', 'HasGarage', 
                      'HasBasement', 'HasFireplace', 'Has2ndFloor', 'HasMasVnr', 
                      'HasWoodDeck', 'HasOpenPorch']

# Only look up features that have a correlation entry. A listed feature that
# is absent from the data or is not numeric would otherwise abort the cell
# with a KeyError; report such features instead of failing.
available_engineered = [name for name in engineered_features if name in correlations.index]
missing_engineered = [name for name in engineered_features if name not in correlations.index]
if missing_engineered:
    print(f"Skipping engineered features without a numeric correlation: {missing_engineered}")

# Get correlations for engineered features, strongest first by absolute value
eng_correlations = correlations[available_engineered].sort_values(ascending=False, key=abs)

print("ENGINEERED FEATURES PERFORMANCE:")
print("=" * 55)
for feature, corr in eng_correlations.items():
    # Marker by strength: strong (> 0.5), moderate (> 0.3), otherwise weak.
    strength = abs(corr)
    if strength > 0.5:
        status = "🔥"
    elif strength > 0.3:
        status = "✅"
    else:
        status = "📊"
    print(f"{status} {feature:<20} {corr:>8.3f}")

eng_strength = eng_correlations.abs()
print(f"\nEngineered features with strong correlation: {int((eng_strength > 0.5).sum())}")
print(f"Engineered features with moderate correlation: {int(((eng_strength > 0.3) & (eng_strength <= 0.5)).sum())}")


STAR PERFORMERS:

    TotalSF (0.782) - best engineered feature!

    TotalBath (0.613) - Bathroom count is highly predictive

    Quality averages - Both exterior and basement quality averages work great

    Age features - Negative correlations make perfect sense (newer = more valuable)

#### Visualize Top Correlations

In [None]:
# Create correlation visualization: strongest positive correlations on the
# left panel, strongest negative correlations on the right panel.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Top positive correlations. The target is dropped by name instead of by
# slicing off the first row, which relied on SalePrice sorting first.
top_positive = correlations.drop('SalePrice').head(15)
top_positive.plot(kind='barh', ax=ax1, color='skyblue')
ax1.set_title('Top 15 Positive Correlations with SalePrice', fontsize=14, fontweight='bold')
ax1.set_xlabel('Correlation Coefficient')
ax1.grid(axis='x', alpha=0.3)

# Top negative correlations. Filter to values below zero first: `correlations`
# is sorted with NaN entries last, so a plain tail(10) could plot NaN (or
# positive) entries in a panel titled "negative".
top_negative = correlations[correlations < 0].tail(10)
if top_negative.empty:
    # Plotting an empty Series raises, so show a placeholder message instead.
    ax2.text(0.5, 0.5, 'No negative correlations', ha='center', va='center',
             transform=ax2.transAxes)
else:
    top_negative.plot(kind='barh', ax=ax2, color='lightcoral')
ax2.set_title('Top 10 Negative Correlations with SalePrice', fontsize=14, fontweight='bold')
ax2.set_xlabel('Correlation Coefficient')
ax2.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics (the "-1" terms remove the SalePrice entry, whose r is 1)
print("CORRELATION ANALYSIS SUMMARY:")
print(f"   • Total numerical features analyzed: {len(correlations)-1}")
print(f"   • Strong positive correlations (r > 0.5): {len(correlations[correlations > 0.5])-1}")
print(f"   • Strong negative correlations (r < -0.5): {len(correlations[correlations < -0.5])}")
print(f"   • Your engineered features in top 15: {len([f for f in top_positive.index if f in engineered_features])}")


KEY INSIGHTS FROM THE CHART:
    
    Four of the engineered features made it into the top 15 (TotalSF, TotalBath, ExteriorQualityAvg, BasementQualityAvg)
    
    TotalSF is #2 - best engineered feature, beating many original features
    
    Age features dominate negatives - HouseAge and YearsSinceRemod are the strongest negative predictors

#### Correlation Heatmap for Top Features

In [None]:
# Create correlation heatmap for top features (excluding target).
# Take the 21 largest |r| values - the target itself is among them with
# |r| = 1 - and filter the target out by name.
top_features = [
    name
    for name in correlations.abs().sort_values(ascending=False).head(21).index
    if name != 'SalePrice'
]

print(f"Analyzing multicollinearity among top {len(top_features)} features (excluding SalePrice)")

# Calculate correlation matrix for top features
correlation_matrix = df[top_features].corr()

# Create heatmap
plt.figure(figsize=(16, 12))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))  # Mask upper triangle
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .8}, fmt='.2f')
# The title reports the actual feature count, which is below 20 when the
# dataset has fewer numerical features.
plt.title(f'Feature-to-Feature Correlation Heatmap - Top {len(top_features)} Features',
          fontsize=16, fontweight='bold')
plt.tight_layout()
# To save the figure, uncomment the next line. It is placed before plt.show()
# because saving after the figure has been shown can produce a blank image.
# plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()

# Identify highly correlated feature pairs (potential multicollinearity).
# One threshold drives both the filter and the printed heading.
HIGH_CORR_THRESHOLD = 0.8
matrix_values = correlation_matrix.to_numpy()
matrix_columns = correlation_matrix.columns
# Index pairs strictly above the diagonal: each unordered pair is visited
# once, in the same row-by-row order as a nested i < j loop.
upper_rows, upper_cols = np.triu_indices_from(matrix_values, k=1)
high_corr_pairs = [
    (matrix_columns[i], matrix_columns[j], matrix_values[i, j])
    for i, j in zip(upper_rows, upper_cols)
    if abs(matrix_values[i, j]) > HIGH_CORR_THRESHOLD
]

print(f"\nHIGH CORRELATION PAIRS (|r| > {HIGH_CORR_THRESHOLD}) - Potential Multicollinearity:")
if not high_corr_pairs:
    # Make the empty result explicit instead of printing a bare heading.
    print("   None found")
for feat1, feat2, corr in high_corr_pairs:
    print(f"   {feat1} ↔ {feat2}: {corr:.3f}")


Strong correlations you found

    TotalSF ↔ GrLivArea (0.874) → Not surprising: TotalSF includes both basement + above-ground area, while GrLivArea is above-ground only.

    TotalSF ↔ TotalBsmtSF (0.827) and TotalSF ↔ 1stFlrSF (0.800) → Same story: TotalSF aggregates other features.

    GrLivArea ↔ TotRmsAbvGrd (0.825) → Makes sense: more above-ground area usually means more rooms.

    ExterQual_Ordinal ↔ ExteriorQualityAvg (0.855) and BsmtQual_Ordinal ↔ BasementQualityAvg (0.840) → These are engineered averages that directly include those ordinals, so a strong correlation is expected.
    
    GarageCars ↔ GarageArea (0.882) → Very strong: more garage spaces = more area. Redundant predictors.
    
    TotalBsmtSF ↔ 1stFlrSF (0.820) → Larger houses often have both big basements and big first floors.
    
    HouseAge ↔ YearBuilt (-1.000) → Perfect negative correlation because one is derived from the other.
    
    YearRemodAdd ↔ YearsSinceRemod (-1.000) → Same: perfect negative correlation because one is derived directly from the other.

What this means for your model

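    Multicollinearity alert: Features like TotalSF, GrLivArea, 1stFlrSF, and TotalBsmtSF are almost telling the same story. Linear models can produce unstable, hard-to-interpret coefficients if we include all of them; plain linear regression is affected most, while Ridge and Lasso are more tolerant because of their regularization.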
    
    Engineered features vs originals: For example, ExterQual_Ordinal and ExteriorQualityAvg → we probably only need one. Same for HouseAge vs YearBuilt.
    
    Tree-based models (Random Forest, XGBoost, LightGBM) are less sensitive, but even then, redundant features can dilute importance.

In practice:

    For linear models: drop some of the redundant ones or use regularization (Lasso).
    
    For tree models: we can keep them, but still worth pruning for efficiency.
    
    For feature selection: check the Variance Inflation Factor (VIF) to confirm multicollinearity.