In [1]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install xgboost
!pip install optuna
!pip install lightgbm
!pip install catboost

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [2]:
import pandas as pd
import numpy as np # Import numpy for log1p and expm1
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso
from catboost import CatBoostRegressor
from sklearn.compose import TransformedTargetRegressor

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/CollabData/kaggle_API/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle competitions download home-data-for-ml-course

Downloading home-data-for-ml-course.zip to /content
  0% 0.00/386k [00:00<?, ?B/s]
100% 386k/386k [00:00<00:00, 91.8MB/s]


In [7]:
! unzip home-data-for-ml-course.zip

Archive:  home-data-for-ml-course.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: sample_submission.csv.gz  
  inflating: test.csv                
  inflating: test.csv.gz             
  inflating: train.csv               
  inflating: train.csv.gz            


In [8]:
# Load the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Separate features and target variable
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing: Handling categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(exclude=['object']).columns

# Impute missing values in numerical features using the mean
imputer = SimpleImputer(strategy='mean') # Create an imputer instance
X_train[numerical_features] = imputer.fit_transform(X_train[numerical_features]) # Fit and transform on training data
X_val[numerical_features] = imputer.transform(X_val[numerical_features]) # Transform validation data
test_df[numerical_features] = imputer.transform(test_df[numerical_features]) # Transform test data


# Initialize KBinsDiscretizer with 'uniform' strategy for tree-like binning
kbd = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Apply KBinsDiscretizer to numerical features
X_train_kbd = kbd.fit_transform(X_train[numerical_features])
X_val_kbd = kbd.transform(X_val[numerical_features])
X_test_kbd = kbd.transform(test_df[numerical_features])

# Convert the output back to DataFrames
X_train_kbd = pd.DataFrame(X_train_kbd, columns=numerical_features, index=X_train.index)
X_val_kbd = pd.DataFrame(X_val_kbd, columns=numerical_features, index=X_val.index)
X_test_kbd = pd.DataFrame(X_test_kbd, columns=numerical_features, index=test_df.index)

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore') # Create an encoder instance
X_train_encoded = encoder.fit_transform(X_train[categorical_features]) # Fit and transform on training data
X_val_encoded = encoder.transform(X_val[categorical_features]) # Transform validation data
X_test_encoded = encoder.transform(test_df[categorical_features]) # Transform test data

# Convert encoded features to DataFrames with appropriate column names
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(categorical_features), index=X_train.index)
X_val_encoded = pd.DataFrame(X_val_encoded, columns=encoder.get_feature_names_out(categorical_features), index=X_val.index)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(categorical_features), index=test_df.index)


# Concatenate the preprocessed numerical features with encoded categorical features
X_train_processed = pd.concat([X_train_kbd, X_train_encoded], axis=1)
X_val_processed = pd.concat([X_val_kbd, X_val_encoded], axis=1)
X_test_processed = pd.concat([X_test_kbd, X_test_encoded], axis=1)

# Define tree_preprocessor and linear_preprocessor
tree_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ])

linear_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
    ])

In [9]:
def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.8),
        'subsample': trial.suggest_float('subsample', 0.4, 0.8),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'early_stopping_rounds': 10
    }

    model = xgb.XGBRegressor(**params)
    model.fit(X_train_processed, y_train,
              eval_set=[(X_val_processed, y_val)],
              verbose=False)

    return model.best_score

In [None]:
# Create and run an Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)  # Increased number of trials

# Get the best hyperparameters
best_params = study.best_params

[I 2025-02-16 14:25:57,753] A new study created in memory with name: no-name-882a32e0-1e28-4805-aa26-4f9c2e229423
[I 2025-02-16 14:26:08,186] Trial 0 finished with value: 50163.445463203374 and parameters: {'booster': 'gbtree', 'lambda': 0.011054261860797165, 'alpha': 1.5382520714105559, 'colsample_bytree': 0.3791888751455815, 'subsample': 0.6664879218564019, 'learning_rate': 0.0029738388846529103, 'n_estimators': 344, 'max_depth': 4, 'min_child_weight': 1, 'gamma': 0.6127603945410842, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 50163.445463203374.
[I 2025-02-16 14:43:39,209] Trial 1 finished with value: 68721.21571523014 and parameters: {'booster': 'dart', 'lambda': 0.11475870743850085, 'alpha': 0.08011365116935851, 'colsample_bytree': 0.5011647184974649, 'subsample': 0.500141519720536, 'learning_rate': 0.0007440534847432022, 'n_estimators': 596, 'max_depth': 3, 'min_child_weight': 9, 'gamma': 0.5355055563922101, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 50

In [None]:
# Pre-trained models with hyperparameters
xgb_tunned = XGBRegressor(n_estimators=6500, alpha=1.7938525031017074e-09, subsample=0.3231512729662032,
                          colsample_bytree=0.25528017285233484, max_depth=5, min_child_weight=2,
                          learning_rate=0.004828231865923587, gamma=0.0026151163125498213, random_state=1)

gbm_tunned = GradientBoostingRegressor(n_estimators=5500, max_depth=5, min_samples_leaf=14,
                                       learning_rate=0.006328507206504974, subsample=0.9170443266552768,
                                       max_features='sqrt', random_state=1)

lgbm_tunned = LGBMRegressor(n_estimators=7000, max_depth=7, learning_rate=0.002536841439596437,
                            min_data_in_leaf=22, subsample=0.7207500503954922, max_bin=210,
                            feature_fraction=0.30010067215105635, random_state=1, verbosity=-1)

catboost_tunned = CatBoostRegressor(iterations=4500, colsample_bylevel=0.05367479984702603,
                                    learning_rate=0.018477566955501026, random_strength=0.1321272840705348,
                                    depth=6, l2_leaf_reg=4, boosting_type='Plain', bootstrap_type='Bernoulli',
                                    subsample=0.7629052520889268, logging_level='Silent', random_state=1)

elasticnet_tunned = ElasticNet(max_iter=3993, alpha=0.0007824887724782356, l1_ratio=0.25,
                               tol=3.78681184748232e-06, random_state=1)

lasso_tunned = Lasso(max_iter=2345, alpha=0.00019885959230548468, tol=2.955506894549702e-05, random_state=1)

In [None]:
# Create pipelines for each model
pipe_xgb = Pipeline(steps=[
    ('tree_preprocessor', tree_preprocessor),
    ('regressor1', xgb_tunned),
])

pipe_gbm = Pipeline(steps=[
    ('tree_preprocessor', tree_preprocessor),
    ('regressor2', gbm_tunned),
])

pipe_lgbm = Pipeline(steps=[
    ('tree_preprocessor', tree_preprocessor),
    ('regressor3', lgbm_tunned),
])

pipe_catboost = Pipeline(steps=[
    ('tree_preprocessor', tree_preprocessor),
    ('regressor4', catboost_tunned),
])

pipe_Elasticnet = Pipeline(steps=[
    ('linear_preprocessor', linear_preprocessor),
    ('regressor5', elasticnet_tunned),
])

TargetTransformedElasticnet = TransformedTargetRegressor(regressor=pipe_Elasticnet, func=np.log1p, inverse_func=np.expm1)

pipe_Lasso = Pipeline(steps=[
    ('linear_preprocessor', linear_preprocessor),
    ('regressor6', lasso_tunned),
])

TargetTransformedLasso = TransformedTargetRegressor(regressor=pipe_Lasso, func=np.log1p, inverse_func=np.expm1)

In [None]:
# Define the stacking regressor
estimators = [
    ('xgb', pipe_xgb),
    ('gbm', pipe_gbm),
    ('lgbm', pipe_lgbm),
    ('catboost', pipe_catboost),
    ('elasticnet', TargetTransformedElasticnet),
    ('lasso', TargetTransformedLasso),
]

stacking_regressor = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression()
)

# Fit the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Make predictions
predictions = stacking_regressor.predict(X_test_processed)

In [None]:
# Model Training and Prediction
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_train_processed, y_train)

predictions = final_model.predict(X_test_processed)

In [None]:
# Evaluate accuracy on validation set
val_predictions = final_model.predict(X_val_processed)

# Calculate RMSE without 'squared' argument and take the square root manually
rmse = mean_squared_error(y_val, val_predictions)**0.5

print(f"Validation RMSE: {rmse}")

In [None]:
#submission_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
#submission_df.to_csv('submission.csv', index=False)