In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
# Suppress convergence warnings for cleaner output
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Load data
train_df = pd.read_csv('/kaggle/input/houseprice/train.csv')
test_df = pd.read_csv('/kaggle/input/houseprice/train.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTrain columns: {list(train_df.columns)}")

# Check for missing values
print(f"\nMissing values in training data:")
missing_train = train_df.isnull().sum()
print(missing_train[missing_train > 0].sort_values(ascending=False))

# Target variable analysis
print(f"\nTarget variable (SalePrice) statistics:")
print(train_df['SalePrice'].describe())

# Log transform target for better distribution (common for price data)
y = np.log1p(train_df['SalePrice'])

# Feature engineering based on actual Ames dataset columns
def feature_engineering(df):
    df = df.copy()
    
    # Total area features (these columns exist in Ames dataset)
    if all(col in df.columns for col in ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']):
        df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    
    # Bathroom features
    bathroom_cols = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
    if all(col in df.columns for col in bathroom_cols):
        df['TotalBath'] = df['FullBath'] + 0.5 * df['HalfBath'] + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
    
    # Porch area features  
    porch_cols = ['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF']
    existing_porch_cols = [col for col in porch_cols if col in df.columns]
    if existing_porch_cols:
        df['TotalPorch'] = df[existing_porch_cols].sum(axis=1)
    
    # Binary indicators
    if 'PoolArea' in df.columns:
        df['HasPool'] = (df['PoolArea'] > 0).astype(int)
    if 'GarageArea' in df.columns:
        df['HasGarage'] = (df['GarageArea'] > 0).astype(int)
    if 'TotalBsmtSF' in df.columns:
        df['HasBsmt'] = (df['TotalBsmtSF'] > 0).astype(int)
    if 'Fireplaces' in df.columns:
        df['HasFireplace'] = (df['Fireplaces'] > 0).astype(int)
    
    # Age features
    if all(col in df.columns for col in ['YrSold', 'YearBuilt']):
        df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    if all(col in df.columns for col in ['YrSold', 'YearRemodAdd']):
        df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
    if 'HouseAge' in df.columns:
        df['IsNew'] = (df['HouseAge'] <= 2).astype(int)
    
    # Quality interactions (these are key features in Ames dataset)
    if all(col in df.columns for col in ['OverallQual', 'GrLivArea']):
        df['OverallQual_x_GrLivArea'] = df['OverallQual'] * df['GrLivArea']
    if all(col in df.columns for col in ['OverallQual', 'TotalSF']):
        df['OverallQual_x_TotalSF'] = df['OverallQual'] * df['TotalSF']
    
    # Handle specific Ames dataset issues
    # Some categorical variables have 'NA' as a category, not missing values
    na_as_category = ['Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
                      'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 
                      'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
    
    for col in na_as_category:
        if col in df.columns:
            df[col] = df[col].fillna('None')
    
    return df



Train shape: (1460, 81)
Test shape: (1460, 81)

Train columns: ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'E

In [5]:
# Apply feature engineering
train_processed = feature_engineering(train_df)
test_processed = feature_engineering(test_df)

In [6]:
# Prepare features (remove Id and target)
X = train_processed.drop(columns=['SalePrice', 'Id'])
X_test = test_processed.drop(columns=['Id'])

# Ensure same columns in train and test
common_cols = X.columns.intersection(X_test.columns)
X = X[common_cols]
X_test = X_test[common_cols]

print(f"\nFeatures after engineering: {X.shape[1]}")



Features after engineering: 91


In [12]:

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify column types
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numerical columns: {len(num_cols)}")
print(f"Categorical columns: {len(cat_cols)}")


Numerical columns: 48
Categorical columns: 43


In [13]:

# Show some actual column names
print(f"\nSample numerical columns: {num_cols[:10]}")
print(f"Sample categorical columns: {cat_cols[:10]}")

# Preprocessing pipelines
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())  # Better for outliers than StandardScaler
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# Combine preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ]
)


Sample numerical columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2']
Sample categorical columns: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1']


In [14]:


# Base models - ONLY LINEAR REGRESSION variants for ensemble
# Different regularization approaches but all linear models
base_models = [
    ('linear', LinearRegression()),
    ('ridge', RidgeCV(alphas=np.logspace(-3, 2, 30), cv=5)),
    ('lasso', LassoCV(alphas=np.logspace(-3, 1, 30), cv=5, max_iter=5000, tol=1e-3)),
    ('elastic', ElasticNetCV(alphas=np.logspace(-3, 1, 20), 
                            l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95], 
                            cv=5, max_iter=5000, tol=1e-3))
]

# Advanced ensemble: Stacking with linear regression meta-learner
stacking_regressor = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', StackingRegressor(
        estimators=base_models,
        final_estimator=LinearRegression(),
        cv=5,
        n_jobs=-1
    ))
])

# Alternative: Simple averaging ensemble
voting_regressor = Pipeline([
    ('preprocessor', preprocessor),
    ('voting', VotingRegressor(
        estimators=base_models,
        n_jobs=-1
    ))
])


In [15]:

# Cross-validation evaluation
def evaluate_model(model, X, y, cv=5):
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=kfold, 
                           scoring='neg_mean_squared_error', n_jobs=-1)
    return np.sqrt(-scores)  # Convert to RMSE

print("\n" + "="*60)
print("CROSS-VALIDATION EVALUATION")
print("="*60)

# Evaluate models
print("\nEvaluating Stacking Regressor (Linear + Ridge + Lasso + ElasticNet)...")
stacking_scores = evaluate_model(stacking_regressor, X_train, y_train)
print(f"Stacking CV RMSE: {stacking_scores.mean():.4f} (+/- {stacking_scores.std() * 2:.4f})")

print("\nEvaluating Voting Regressor...")
voting_scores = evaluate_model(voting_regressor, X_train, y_train)
print(f"Voting CV RMSE: {voting_scores.mean():.4f} (+/- {voting_scores.std() * 2:.4f})")

# Train the final model
print("\n" + "="*60)
print("TRAINING FINAL MODEL")
print("="*60)

# Use the better performing model
if stacking_scores.mean() < voting_scores.mean():
    final_model = stacking_regressor
    model_name = "Stacking Regressor"
else:
    final_model = voting_regressor
    model_name = "Voting Regressor"

print(f"\nUsing {model_name} as final model")
final_model.fit(X_train, y_train)

# Validation predictions
y_val_pred = final_model.predict(X_val)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_mae = mean_absolute_error(y_val, y_val_pred)

print(f"\nValidation Metrics:")
print(f"RMSE: {val_rmse:.4f}")
print(f"MAE: {val_mae:.4f}")

# Generate test predictions
print("\n" + "="*60)
print("GENERATING PREDICTIONS")
print("="*60)

test_predictions = final_model.predict(X_test)
# Convert back from log scale
test_predictions = np.expm1(test_predictions)

# Create submission file
submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': test_predictions
})

submission.to_csv('submission.csv', index=False)

print(f"Submission created with {len(submission)} predictions")
print(f"Price range: ${test_predictions.min():,.0f} - ${test_predictions.max():,.0f}")
print(f"Mean price: ${test_predictions.mean():,.0f}")

# Show first few predictions
print(f"\nFirst 10 predictions:")
print(submission.head(10))

# Model summary
print("\n" + "="*60)
print("MODEL SUMMARY")
print("="*60)
print(f"✓ Dataset: Ames Housing with {X.shape[1]} features")
print(f"✓ Target: Log-transformed SalePrice")
print(f"✓ Base Models: Linear, Ridge, Lasso, ElasticNet regression")
print(f"✓ Ensemble: {model_name}")
print(f"✓ Preprocessing: Robust scaling + One-hot encoding") 
print(f"✓ Cross-validation RMSE: {min(stacking_scores.mean(), voting_scores.mean()):.4f}")
print(f"✓ Validation RMSE: {val_rmse:.4f}")
print("✓ Ready for Kaggle submission!")

# Additional insights
print(f"\nModel Insights:")
print(f"• Used {len(num_cols)} numerical and {len(cat_cols)} categorical features")
print(f"• Log transformation helps with price distribution")
print(f"• Ensemble combines linear models with different regularization")
print(f"• Cross-validation ensures robust performance estimates")


CROSS-VALIDATION EVALUATION

Evaluating Stacking Regressor (Linear + Ridge + Lasso + ElasticNet)...




Stacking CV RMSE: 19215.4360 (+/- 47409.8394)

Evaluating Voting Regressor...




Voting CV RMSE: 11752.2496 (+/- 29351.1865)

TRAINING FINAL MODEL

Using Voting Regressor as final model

Validation Metrics:
RMSE: 244.5080
MAE: 19.4654

GENERATING PREDICTIONS
Submission created with 1460 predictions
Price range: $47,611 - $inf
Mean price: $inf

First 10 predictions:
   Id      SalePrice
0   1  208744.397106
1   2  198919.880255
2   3  218620.236644
3   4  168733.073957
4   5  296032.993874
5   6  156248.435923
6   7  275162.185225
7   8  212350.659869
8   9  133867.887691
9  10  117294.026676

MODEL SUMMARY
✓ Dataset: Ames Housing with 91 features
✓ Target: Log-transformed SalePrice
✓ Base Models: Linear, Ridge, Lasso, ElasticNet regression
✓ Ensemble: Voting Regressor
✓ Preprocessing: Robust scaling + One-hot encoding
✓ Cross-validation RMSE: 11752.2496
✓ Validation RMSE: 244.5080
✓ Ready for Kaggle submission!

Model Insights:
• Used 48 numerical and 43 categorical features
• Log transformation helps with price distribution
• Ensemble combines linear models with d