In [16]:
# Owen Benner Econ 484 Kaggle comp assignment
"""
1. Data Processing:
- Combined train/test datasets
- Handled missing values (median for numeric, mode for categorical)
- Converted categorical variables to numeric using LabelEncoder
- Created new features (TotalSF, TotalBathrooms, HouseAge, TotalPorchSF)

2. Base Models Performance (RMSLE scores - lower is better):
- CatBoost:  0.120249 (Best)
- XGBoost:   0.126613
- LightGBM:  0.132586

3. Stacking Implementation:
- Used all 3 gradient boosting models as base models
- Used Lasso as meta-model
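- Final stacking score: 0.123707
  (better than XGBoost and LightGBM, but not CatBoost; note that, as the code
  stands, the stack is scored with 5-fold CV while the base models are scored
  with 10-fold CV, so the comparison is only approximate)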
"""
!pip install catboost
#preamble
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error
from sklearn.base import BaseEstimator, RegressorMixin, clone
import warnings
warnings.filterwarnings('ignore')



In [15]:
#read the train and test tables (train first, then test) from the working directory
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

#combine train and test
def combine_data(train_df, test_df):
    """Stack the train and test features into one frame and pull out the target.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain a 'SalePrice' column. Not modified.
    test_df : pandas.DataFrame
        Test rows.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The train features (without 'SalePrice') followed by the test rows,
        each keeping its original index labels, and the 'SalePrice' column
        of the training frame.
    """
    sale_price = train_df['SalePrice']
    train_features = train_df.drop(columns='SalePrice')
    stacked = pd.concat((train_features, test_df), axis=0)
    return stacked, sale_price

#preprocessing
def preprocess_data(df):
    """Impute missing values and label-encode the categorical columns of `df`.

    Numeric columns have their missing values replaced by the column median.
    Object-dtype columns have their missing values replaced by the most
    frequent value and are then converted to integer codes with LabelEncoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after imputation and encoding.
    """
    from sklearn.preprocessing import LabelEncoder

    #'number' matches every numeric width, not only int64/float64
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    #filling numeric missing values with median
    for col in numeric_cols:
        df[col] = df[col].fillna(df[col].median())

    for col in categorical_cols:
        #filling categorical missing values with mode
        modes = df[col].mode()
        #mode() is empty when every value is missing; indexing it would raise,
        #so fall back to a placeholder category instead
        fill_value = modes.iloc[0] if not modes.empty else 'Missing'
        df[col] = df[col].fillna(fill_value)
        #converting categorical to numeric using label encoding
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df

#time for some feature engineering
def add_features(df):
    """Append four engineered columns to `df` (in place) and return it.

    Columns added, in this order:
      * TotalSF        - TotalBsmtSF + GrLivArea
      * TotalBathrooms - FullBath + BsmtFullBath, with HalfBath and
                         BsmtHalfBath each weighted by 0.5
      * HouseAge       - YrSold - YearBuilt
      * TotalPorchSF   - WoodDeckSF + OpenPorchSF + EnclosedPorch
                         + 3SsnPorch + ScreenPorch

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns listed above; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Total SF
    df['TotalSF'] = df['TotalBsmtSF'].add(df['GrLivArea'])

    # Total Bathrooms (half baths weighted by 0.5)
    above_grade_baths = df['FullBath'].add(df['HalfBath'].mul(0.5))
    with_bsmt_full = above_grade_baths.add(df['BsmtFullBath'])
    df['TotalBathrooms'] = with_bsmt_full.add(df['BsmtHalfBath'].mul(0.5))

    # House Age when sold
    df['HouseAge'] = df['YrSold'].sub(df['YearBuilt'])

    # Total Porch SF, accumulated left to right over the porch/deck columns
    porch_columns = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
    porch_total = df[porch_columns[0]]
    for column in porch_columns[1:]:
        porch_total = porch_total + df[column]
    df['TotalPorchSF'] = porch_total
    return df

#run the whole pipeline on the combined train + test rows
all_data, target = combine_data(train_df, test_df)
all_data = add_features(preprocess_data(all_data))

#the first len(train_df) rows are the training rows, everything after is test
n_train = len(train_df)
train_data, test_data = all_data.iloc[:n_train], all_data.iloc[n_train:]

print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)

Train shape: (1460, 84)
Test shape: (1459, 84)


In [None]:
#all three boosters share learning_rate 0.01, 3000 boosting rounds and seed 42

#XGBoost params
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=3000,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    random_state=42,
)

#LightGBM params
#NOTE(review): LightGBM documents that row subsampling (`subsample`) only takes
#effect when `subsample_freq` > 0, which is not set here - confirm whether
#bagging was intended before relying on this value.
lgb_params = dict(
    learning_rate=0.01,
    n_estimators=3000,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

#CatBoost params
cat_params = dict(
    learning_rate=0.01,
    iterations=3000,
    depth=6,
    random_state=42,
)

#instantiate models (CatBoost's training log is switched off)
xgb_model = xgb.XGBRegressor(**xgb_params)
lgb_model = lgb.LGBMRegressor(**lgb_params)
cat_model = CatBoostRegressor(verbose=False, **cat_params)

#take a look at preliminary performance
def rmsle_cv(model, X, y):
    """Return the mean RMSLE of `model` over a shuffled 10-fold cross-validation.

    Each fold's score comes from `cross_val_score` with the
    "neg_mean_squared_log_error" scorer; the sign is flipped and the square
    root taken per fold before averaging. The folds are seeded with 484.
    """
    folds = KFold(n_splits=10, shuffle=True, random_state=484)
    neg_msle_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_log_error", cv=folds
    )
    rmsle_per_fold = np.sqrt(-neg_msle_per_fold)
    return rmsle_per_fold.mean()
#cross-validated RMSLE of each base model, in the order XGBoost, LightGBM, CatBoost
for label, estimator in (("XGBoost", xgb_model), ("LightGBM", lgb_model), ("CatBoost", cat_model)):
    print(f"{label} score:", rmsle_cv(estimator, train_data, target))

In [13]:
#lets stack the models!
class StackingRegressor(BaseEstimator, RegressorMixin):
    """Two-level stacking ensemble.

    Every base model produces out-of-fold predictions through a shuffled
    K-fold split; those predictions are the training features of the meta
    model. For prediction, each base model is refit on the full training
    data and its output is fed to the meta model.

    The estimators passed to the constructor are never fitted themselves:
    `fit` works on clones and stores the fitted copies in `base_models_`
    and `meta_model_`, so the caller's objects keep their state.

    Parameters
    ----------
    base_models : list of estimators
        Level-0 regressors; each must be usable with `sklearn.base.clone`.
    meta_model : estimator
        Level-1 regressor trained on the out-of-fold predictions.
    n_folds : int, default=5
        Number of folds used to build the out-of-fold predictions.
    random_state : int, default=42
        Seed of the shuffled KFold split.

    Attributes
    ----------
    base_models_ : list of estimators
        Clones of `base_models` fitted on the full training data.
    meta_model_ : estimator
        Clone of `meta_model` fitted on `base_predictions`.
    base_predictions : numpy.ndarray or None
        Out-of-fold predictions of the last `fit` call, one column per base
        model; None before the first fit.
    """

    def __init__(self, base_models, meta_model, n_folds=5, random_state=42):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.random_state = random_state
        self.base_predictions = None

    @staticmethod
    def _rows(data, idx):
        """Select rows by position from a pandas object or any array-like."""
        if hasattr(data, 'iloc'):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    def fit(self, X, y):
        """Fit the base models and the meta model; returns self.

        Raises
        ------
        ValueError
            If `base_models` is empty.
        """
        if len(self.base_models) == 0:
            raise ValueError("base_models must contain at least one estimator")

        #generate oof predictions for stacking data; the split is materialised
        #once so every base model is evaluated on exactly the same folds
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        folds = list(kf.split(X))
        self.base_predictions = np.zeros((len(X), len(self.base_models)))
        self.base_models_ = []

        #train each base model using k-fold
        for i, model in enumerate(self.base_models):
            print(f"Training base model {i+1}/{len(self.base_models)}")
            for train_idx, val_idx in folds:
                #clone model to ensure fresh instance
                fold_model = clone(model)
                #train on training fold
                fold_model.fit(self._rows(X, train_idx), self._rows(y, train_idx))
                #predict on validation fold
                self.base_predictions[val_idx, i] = fold_model.predict(self._rows(X, val_idx))

            #retrain a fresh clone on the full dataset; the estimator handed to
            #the constructor is left unfitted
            full_model = clone(model)
            full_model.fit(X, y)
            self.base_models_.append(full_model)

        #train (a clone of) the meta model using base model predictions
        self.meta_model_ = clone(self.meta_model)
        self.meta_model_.fit(self.base_predictions, y)
        return self

    def predict(self, X):
        """Predict with the fitted base models, then combine with the meta model."""
        #make predictions with base models
        meta_features = np.column_stack([
            model.predict(X) for model in self.base_models_
        ])
        #use meta model for final predictions
        return self.meta_model_.predict(meta_features)

Training base model 1/3
Training base model 2/3
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3777
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 75
[LightGBM] [Info] Start training from score 181296.608137
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3757
[LightGBM] [Info] Number of data points in the train set: 934, number of used features: 74
[LightGBM] [Info] Start training from score 181116.488223
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000826 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can

In [None]:
#setting up our stacking model
from sklearn.linear_model import Lasso

#level-0 learners: the three boosted-tree regressors defined earlier
base_models = [xgb_model, lgb_model, cat_model]

#level-1 learner: lasso this time.
meta_model = Lasso(alpha=0.0005)

#wire both levels into the stacking ensemble (n_folds keeps its default)
stacker = StackingRegressor(base_models=base_models, meta_model=meta_model)

#evaluate it with cross validation
def rmsle_cv(model, X, y, n_folds=5):
    """Return the mean RMSLE of `model` over a shuffled K-fold split (seed 42).

    NOTE: this redefines the `rmsle_cv` declared earlier in the notebook
    (10 folds, seed 484, via cross_val_score); from this cell on the name
    refers to this manual version.

    `model` itself is fitted on every fold (no clone is made), so on return
    it is left fitted on the training rows of the last fold. `X` and `y`
    are indexed with `.iloc`, i.e. they must be pandas objects.
    """
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    fold_scores = []

    for train_idx, val_idx in splitter.split(X):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        predictions = model.predict(X.iloc[val_idx])
        fold_error = mean_squared_log_error(y.iloc[val_idx], predictions)
        fold_scores.append(np.sqrt(fold_error))

    return np.mean(fold_scores)
stack_score = rmsle_cv(stacker, train_data, target)

score_rows = [
    #original scores (hard-coded literals, not recomputed here)
    ("CatBoost:  ", 0.12024884970422434),
    ("XGBoost:   ", 0.12661306520175142),
    ("LightGBM:  ", 0.1325861852404577),
    #stacked model score
    ("Stacking:  ", stack_score),
]

print("\nModel Scores:")
for label, value in score_rows:
    print(f"{label}{value:.6f}")