In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer

from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from objectiveOpt import studier

In [2]:
# Read the raw CSV files.
# The training frame discards its 'Id' column outright; the test frame's 'Id'
# column is removed from the features and kept aside in `test_ids`.
train = pd.read_csv('data/train.csv').drop(columns='Id')
test = pd.read_csv('data/test.csv')
test_ids = test.pop('Id')

In [3]:
# Fill missing values for specific columns.
# The same constant fill is applied to BOTH frames: if only the training frame
# were filled here, missing test values in these columns would instead be
# handled by the downstream imputer and end up encoded differently from
# identical training rows.
missing_fill = {'LotFrontage': 0, 'MiscVal': 0}
train = train.fillna(missing_fill)
test = test.fillna(missing_fill)

In [4]:
# Feature engineering
# Columns that are multiplied by OverallQual, keyed by the name of the derived
# feature. Dict order is the order in which the new columns are appended.
QUALITY_INTERACTIONS = {
    'year_qual': 'YearBuilt',
    'year_r_qual': 'YearRemodAdd',
    'qual_bsmt': 'TotalBsmtSF',
    'qual_fl': '1stFlrSF',
    'qual_gr': 'GrLivArea',
    'qual_gar_area': 'GarageArea',
    'qual_gar_cars': 'GarageCars',
    'qual_bath': 'FullBath',
    'qual_bed': 'BedroomAbvGr',
    'qual_kit': 'KitchenAbvGr',
    'qual_fire': 'Fireplaces',
    'qual_wd': 'WoodDeckSF',
    'qual_op': 'OpenPorchSF',
    'qual_en': 'EnclosedPorch',
    'qual_3s': '3SsnPorch',
    'qual_scr': 'ScreenPorch',
    'qual_pool': 'PoolArea',
    'qual_mo': 'MoSold',
    'qual_yr': 'YrSold',
}


def add_engineered_features(df):
    """Return a copy of ``df`` with the derived feature columns appended.

    The input frame is not modified. Using one function for both the training
    and the test frame guarantees that the two receive exactly the same
    derived columns, in the same order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the raw columns referenced below (e.g. 'OverallQual',
        'GrLivArea', 'TotalBsmtSF', 'YrSold', ...).

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the derived ones.
    """
    out = df.copy()

    # Quality interaction terms: OverallQual times each listed raw column.
    overall_qual = out['OverallQual']
    for feature_name, source_col in QUALITY_INTERACTIONS.items():
        out[feature_name] = overall_qual * out[source_col]

    out['total_sqft'] = out['GrLivArea'] + out['TotalBsmtSF']
    # Half baths count as 0.5 of a full bath.
    out['total_bathrooms'] = (
        out['FullBath']
        + (0.5 * out['HalfBath'])
        + out['BsmtFullBath']
        + (0.5 * out['BsmtHalfBath'])
    )
    out['house_age'] = out['YrSold'] - out['YearBuilt']
    out['remod_age'] = out['YrSold'] - out['YearRemodAdd']
    # NOTE(review): despite its name this is total_sqft * OverallQual, not a
    # price ratio; the column name is kept unchanged so the feature set stays
    # the same.
    out['price_per_sqft'] = out['total_sqft'] * out['OverallQual']
    out['garage_age'] = out['GarageYrBlt'] - out['YearBuilt']
    out['total_porch'] = (
        out['OpenPorchSF']
        + out['EnclosedPorch']
        + out['3SsnPorch']
        + out['ScreenPorch']
    )
    # Binary presence flags (a missing area compares as False, i.e. 0).
    out['has_pool'] = (out['PoolArea'] > 0).astype(int)
    out['has_garage'] = (out['GarageArea'] > 0).astype(int)
    out['has_basement'] = (out['TotalBsmtSF'] > 0).astype(int)
    out['total_area'] = out['total_sqft'] + out['total_porch']
    out['quality_score'] = out['OverallQual'] * out['OverallCond']
    return out


train = add_engineered_features(train)
test = add_engineered_features(test)

In [5]:
# Detach the SalePrice target from the training frame and keep it on a
# log1p scale, then stack the training and test feature rows into one frame
# with a fresh 0..n-1 index.
train_target = np.log1p(train.pop('SalePrice'))
combinedSet = pd.concat([train, test], ignore_index=True)

In [6]:
# Slice the stacked frame back into its two parts: the first
# len(train_target) rows are the training rows, the remainder the test rows.
n_train_rows = len(train_target)
train = combinedSet.iloc[:n_train_rows].copy()
test = combinedSet.iloc[n_train_rows:].copy()

In [7]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, numeric-typed columns as numerical.
categorical_cols = list(combinedSet.select_dtypes(include='object').columns)
numerical_cols = list(combinedSet.select_dtypes(include='number').columns)

In [8]:
# Per-dtype preprocessing pipelines.
# Numeric columns: fill gaps with the column median, then standardise.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(steps=numerical_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode into a dense array, ignoring categories not seen during fitting.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [9]:
# Route each column group to its pipeline: 'num' handles the numerical
# columns, 'cat' handles the categorical ones.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [10]:
# Apply the preprocessor to the data.
# fit_transform learns the preprocessing state from the training rows and
# returns their transformed values; the test rows then go through transform
# only, so they are never used for fitting.
train_processed = preprocessor.fit_transform(train)
test_processed = preprocessor.transform(test)

In [11]:
# Hold out 20% of the processed training rows (and the matching targets) as a
# validation set; the fixed random_state keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_processed,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Run the hyper-parameter search imported from objectiveOpt. The call returns
# a (params, value) pair; `params` is later unpacked into XGBRegressor.
# NOTE(review): studier's implementation is not part of this notebook. The
# argument 500 is presumably the number of trials (the saved progress bar
# reads 0/500), and it is not visible here how the function obtains the
# training/validation data - confirm against objectiveOpt.
params, value = studier(500)

[I 2024-11-15 18:32:28,571] A new study created in memory with name: We the best house


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-11-15 18:32:29,483] Trial 0 finished with value: 0.1496005525608685 and parameters: {'max_depth': 5, 'min_child_weight': 4, 'gamma': 0.8449471207585706, 'reg_lambda': 1.8728360393502075, 'reg_alpha': 0.10634123314246335, 'subsample': 0.7975942942635312, 'colsample_bytree': 0.7856388186976453, 'learning_rate': 0.0436178433796284}. Best is trial 0 with value: 0.1496005525608685.
[I 2024-11-15 18:32:30,860] Trial 1 finished with value: 0.14178373716320952 and parameters: {'max_depth': 5, 'min_child_weight': 6, 'gamma': 0.42877377032027, 'reg_lambda': 1.1288767934028132, 'reg_alpha': 0.07273239544390298, 'subsample': 0.8369114718212987, 'colsample_bytree': 0.7280730210282248, 'learning_rate': 0.028414634063039845}. Best is trial 1 with value: 0.14178373716320952.
[I 2024-11-15 18:32:32,293] Trial 2 finished with value: 0.1521579982588804 and parameters: {'max_depth': 5, 'min_child_weight': 5, 'gamma': 0.944713980937543, 'reg_lambda': 3.0351778545075385, 'reg_alpha': 0.1348353583464

In [None]:
# Cell-local imports: cupy is only needed for this GPU cell, and Path is used
# to make sure the output directory exists before writing.
from pathlib import Path

import cupy as cp

# Refit XGBoost with the tuned hyper-parameters on the GPU. n_estimators is an
# upper bound; early_stopping_rounds=30 ends training once the metric on the
# last eval_set entry (the validation split) has not improved for 30 rounds.
final_model = XGBRegressor(
    **params,
    random_state=42,
    device='cuda',
    tree_method='hist',
    early_stopping_rounds=30,
    n_estimators=10000,
)

# Convert the train/validation splits to cupy arrays before fitting.
cpXt = cp.array(X_train)
cpYt = cp.array(y_train)
cpXv = cp.array(X_valid)
cpYv = cp.array(y_valid)
final_model.fit(
    cpXt,
    cpYt,
    eval_set=[(cpXt, cpYt), (cpXv, cpYv)],
)

# The target was fitted on a log1p scale, so expm1 maps predictions back to
# the original scale.
testCp = cp.array(test_processed)
predictions = np.expm1(final_model.predict(testCp))
final_pred = pd.DataFrame(predictions)

# Pair ids and predictions by position: resetting the id index to 0..n-1
# matches final_pred's default index, so the concat cannot misalign rows even
# if the test frame's index is ever changed upstream.
submission = pd.concat([test_ids.reset_index(drop=True), final_pred], axis=1)
# NOTE(review): the source column is 'Id' (capitalised) but the header written
# here is 'id' - confirm which spelling the consumer of this file expects.
submission.columns = ['id', 'SalePrice']

# Create the output directory if needed so the write cannot fail after the
# expensive fit above.
submission_path = Path('submissions/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(submission_path, index=False)

In [None]:
# Echo the tuned hyper-parameters, then persist them as a one-row table
# serialised to JSON in 'records' orientation.
print(params)
best_so_far = pd.DataFrame(params, index=[0])
best_so_far.to_json('best_so_farv2.json', orient='records')