In [1]:
# Cell 1: Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from scipy.special import boxcox1p
from scipy.stats import skew
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# Cell 2: Load Data
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"\nFirst few rows of train data:")
print(train.head())

Train shape: (1460, 81)
Test shape: (1459, 80)

First few rows of train data:
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0  

In [4]:
# Cell 3: Save Test IDs and Remove Outliers
test_ID = test['Id']

# Remove outliers (recommended in competition description)
print(f"Before removing outliers: {train.shape}")
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)
print(f"After removing outliers: {train.shape}")

Before removing outliers: (1460, 81)
After removing outliers: (1458, 81)


In [5]:
# Cell 4: Log Transform Target Variable
train["SalePrice"] = np.log1p(train["SalePrice"])
y_train = train['SalePrice'].values

train.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

print(f"Target variable (log-transformed) shape: {y_train.shape}")
print(f"Train features shape: {train.shape}")
print(f"Test features shape: {test.shape}")

Target variable (log-transformed) shape: (1458,)
Train features shape: (1458, 79)
Test features shape: (1459, 79)


In [6]:
# Cell 5: Feature Engineering Function
def engineer_features(df):
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df['TotalBath'] = (df['FullBath'] + (0.5 * df['HalfBath']) + df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))
    df['TotalPorchSF'] = (df['OpenPorchSF'] + df['3SsnPorch'] + df['EnclosedPorch'] + df['ScreenPorch'] + df['WoodDeckSF'])
    df['HasPool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    df['Has2ndFloor'] = df['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    df['HasGarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    df['HasBsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    df['HasFireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
    df['OverallGrade'] = df['OverallQual'] * df['OverallCond']
    df['QualGrLiv'] = df['OverallQual'] * df['GrLivArea']
    df['QualBsmt'] = df['OverallQual'] * df['TotalBsmtSF']
    df['QualGarage'] = df['OverallQual'] * df['GarageArea']
    df['QualPorch'] = df['OverallQual'] * df['TotalPorchSF']
    df['HouseAge'] = df['YrSold'] - df['YearBuilt']
    df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
    df['GarageAge'] = df['YrSold'] - df['GarageYrBlt']
    df['IsRemodeled'] = (df['YearRemodAdd'] != df['YearBuilt']).astype(int)
    df['IsNewHouse'] = (df['YearBuilt'] == df['YrSold']).astype(int)
    df['SimplOverallQual'] = df['OverallQual'].replace({1: 1, 2: 1, 3: 1, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3, 10: 3})
    df['SimplExterQual'] = df['ExterQual'].replace({'Po': 1, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
    df['SimplKitchenQual'] = df['KitchenQual'].replace({'Po': 1, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4})
    df['SimplBsmtQual'] = df['BsmtQual'].replace({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['SimplGarageQual'] = df['GarageQual'].replace({'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    df['SimplHeatingQC'] = df['HeatingQC'].replace({'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5})
    return df

print("Feature engineering function defined!")

Feature engineering function defined!


In [7]:
# Cell 6: Missing Values Handling Function
def handle_missing_values(df):
    none_cols = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']
    for col in none_cols:
        if col in df.columns:
            df[col].fillna('None', inplace=True)
    zero_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']
    for col in zero_cols:
        if col in df.columns:
            df[col].fillna(0, inplace=True)
    if 'LotFrontage' in df.columns:
        if 'Neighborhood' in df.columns:
            df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
        else:
            df['LotFrontage'].fillna(df['LotFrontage'].median(), inplace=True)
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            if df[col].dtype == 'object':
                df[col].fillna(df[col].mode()[0], inplace=True)
            else:
                df[col].fillna(df[col].median(), inplace=True)
    return df

print("Missing values handling function defined!")

Missing values handling function defined!


In [8]:
# Cell 7: Feature Encoding Function
def encode_features(df):
    ordinal_mappings = {
        'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'BsmtQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'BsmtCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'BsmtExposure': {'None': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
        'BsmtFinType1': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
        'BsmtFinType2': {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
        'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'FireplaceQu': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'GarageQual': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'GarageCond': {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'PoolQC': {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
        'Fence': {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
    }
    for col, mapping in ordinal_mappings.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)
    df = pd.get_dummies(df, drop_first=True)
    return df

print("Feature encoding function defined!")

Feature encoding function defined!


In [10]:
# Cell 8: Fix Skewness Function
def fix_skewness(df, threshold=0.75):
    # Get numeric columns only
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Remove binary columns
    numeric_cols = [col for col in numeric_cols if df[col].nunique() > 2]
    if len(numeric_cols) == 0:
        print("No numeric features to fix skewness")
        return df
    # Calculate skewness safely
    skewness_dict = {}
    for col in numeric_cols:
        try:
            col_skew = skew(df[col].dropna())
            if not np.isnan(col_skew) and not np.isinf(col_skew):
                skewness_dict[col] = col_skew
        except:
            continue
    if not skewness_dict:
        print("Could not calculate skewness for any features")
        return df
    skewness = pd.DataFrame({'Skew': skewness_dict}).sort_values(by='Skew', ascending=False)
    skewness = skewness[abs(skewness['Skew']) > threshold]
    print(f"Number of skewed features (|skew| > {threshold}): {len(skewness)}")
    lam = 0.15
    for feat in skewness.index:
        try:
            df[feat] = boxcox1p(df[feat], lam)
        except:
            continue
    return df

print("Skewness fixing function defined!")

Skewness fixing function defined!


In [11]:
# Cell 9: Apply All Preprocessing Steps
print("Step 1: Handling Missing Values FIRST...")
train = handle_missing_values(train)
test = handle_missing_values(test)
print("Missing values handled!")

print("\nStep 2: Feature Engineering...")
train = engineer_features(train)
test = engineer_features(test)
print(f"Train shape after feature engineering: {train.shape}")

print("\nStep 3: Encoding Features...")
ntrain = train.shape[0]
ntest = test.shape[0]
all_data = pd.concat((train, test)).reset_index(drop=True)
print(f"Combined data shape: {all_data.shape}")
all_data = encode_features(all_data)
print(f"After encoding: {all_data.shape}")

print("\nStep 4: Fixing Skewness...")
all_data = fix_skewness(all_data)

train = all_data[:ntrain]
test = all_data[ntrain:]

print("\nStep 5: Final NaN check and filling...")
print(f"Train NaN count: {train.isnull().sum().sum()}")
print(f"Test NaN count: {test.isnull().sum().sum()}")
if train.isnull().sum().sum() > 0 or test.isnull().sum().sum() > 0:
    print("Filling remaining NaN values with 0...")
    train = train.fillna(0)
    test = test.fillna(0)
print(f"\nFinal train shape: {train.shape}")
print(f"Final test shape: {test.shape}")

Step 1: Handling Missing Values FIRST...
Missing values handled!

Step 2: Feature Engineering...
Train shape after feature engineering: (1458, 103)

Step 3: Encoding Features...
Combined data shape: (2917, 103)
After encoding: (2917, 243)

Step 4: Fixing Skewness...
Number of skewed features (|skew| > 0.75): 39

Step 5: Final NaN check and filling...
Train NaN count: 0
Test NaN count: 1
Filling remaining NaN values with 0...

Final train shape: (1458, 243)
Final test shape: (1459, 243)


In [12]:
# Cell 10: Scale Features
scaler = RobustScaler()
X_train = scaler.fit_transform(train)
X_test = scaler.transform(test)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print("\nData is ready for modeling!")

X_train shape: (1458, 243)
X_test shape: (1459, 243)
y_train shape: (1458,)

Data is ready for modeling!


In [13]:
# Cell 11: Stacking Model Classes
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)

class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
    def predict(self, X):
        meta_features = np.column_stack([np.column_stack([model.predict(X) for model in base_models]).mean(axis=1) for base_models in self.base_models_])
        return self.meta_model_.predict(meta_features)

print("Stacking model classes defined!")

Stacking model classes defined!


In [14]:
# Cell 12: Define Base Models
print("Defining models...")
lasso = Lasso(alpha=0.0005, random_state=42, max_iter=10000)
ridge = Ridge(alpha=10, random_state=42)
elasticnet = ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=42, max_iter=10000)
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state=42)
xgboost = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213, random_state=42, n_jobs=-1)
lightgbm = lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin=55, bagging_fraction=0.8, bagging_freq=5, feature_fraction=0.2319, feature_fraction_seed=9, bagging_seed=9, min_data_in_leaf=6, min_sum_hessian_in_leaf=11, random_state=42, verbose=-1)
print("All models defined!")

Defining models...
All models defined!


In [15]:
# Cell 13: Create Stacked Model
print("Creating stacked model...")
stacked_averaged_models = StackingAveragedModels(base_models=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm), meta_model=ridge)
print("Stacked model created!")

Creating stacked model...
Stacked model created!


In [16]:
# Cell 14: Train Stacked Model (This will take several minutes)
print("Training stacked model...")
print("This may take 5-10 minutes...\n")
stacked_averaged_models.fit(X_train, y_train)
print("Stacked model training completed!\n")
stacked_train_pred = stacked_averaged_models.predict(X_train)
stacked_pred = stacked_averaged_models.predict(X_test)
stacked_rmse = np.sqrt(mean_squared_error(y_train, stacked_train_pred))
print(f"Stacked Model Training RMSE: {stacked_rmse:.5f}")

Training stacked model...
This may take 5-10 minutes...

Stacked model training completed!

Stacked Model Training RMSE: 0.08189


In [17]:
# Cell 15: Train XGBoost Model
print("Training XGBoost model...")
xgboost.fit(X_train, y_train)
xgb_train_pred = xgboost.predict(X_train)
xgb_pred = xgboost.predict(X_test)
xgb_rmse = np.sqrt(mean_squared_error(y_train, xgb_train_pred))
print(f"XGBoost Training RMSE: {xgb_rmse:.5f}")

Training XGBoost model...
XGBoost Training RMSE: 0.08501


In [18]:
# Cell 16: Train LightGBM Model
print("Training LightGBM model...")
lightgbm.fit(X_train, y_train)
lgb_train_pred = lightgbm.predict(X_train)
lgb_pred = lightgbm.predict(X_test)
lgb_rmse = np.sqrt(mean_squared_error(y_train, lgb_train_pred))
print(f"LightGBM Training RMSE: {lgb_rmse:.5f}")

Training LightGBM model...
LightGBM Training RMSE: 0.06917


In [19]:
# Cell 17: Create Ensemble Predictions and Submission
print("Creating ensemble predictions...")
ensemble_pred = stacked_pred * 0.70 + xgb_pred * 0.15 + lgb_pred * 0.15
ensemble_pred = np.expm1(ensemble_pred)
submission = pd.DataFrame({'Id': test_ID, 'SalePrice': ensemble_pred})
submission.to_csv('submission.csv', index=False)
print("✓ Submission file created: submission.csv")
print(f"✓ Total predictions: {len(submission)}")

Creating ensemble predictions...
✓ Submission file created: submission.csv
✓ Total predictions: 1459


In [20]:
# Cell 18: Summary
print("=" * 60)
print("FINAL MODEL SUMMARY")
print("=" * 60)
print(f"\nTraining RMSE Scores:")
print(f"  Stacked Model: {stacked_rmse:.5f}")
print(f"  XGBoost:       {xgb_rmse:.5f}")
print(f"  LightGBM:      {lgb_rmse:.5f}")
print(f"\nExpected Kaggle Score: ~0.10 - 0.115")
print("=" * 60)
print(submission.head(10))

FINAL MODEL SUMMARY

Training RMSE Scores:
  Stacked Model: 0.08189
  XGBoost:       0.08501
  LightGBM:      0.06917

Expected Kaggle Score: ~0.10 - 0.115
     Id      SalePrice
0  1461  121307.690609
1  1462  159905.288276
2  1463  180975.959669
3  1464  195776.685999
4  1465  190247.389224
5  1466  171904.316137
6  1467  176145.425293
7  1468  163604.639984
8  1469  189707.971461
9  1470  122646.580007
