# House Prices - Advanced Regression Techniques
### Using XGBOOST

## ─────────────────────────────────────────────────────────

### 데이터 로드 & 타겟 변수 분리(SalePrice)

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

y = train['SalePrice']
train = train.drop('SalePrice', axis=1)

## ─────────────────────────────────────────────────────────

### 불필요한 칼럼 제거, 유사한 칼럼 합치기

In [4]:
chk_list = ["Id", "Alley", "PoolQC", "Fence", "MiscFeature", "FireplaceQu"]
train = train.drop(chk_list, axis=1)
test = test.drop(chk_list, axis=1)


chk_list2 = [
    'YrSold', 'YearBuilt', 'YearRemodAdd',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
    'OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF',
    'TotRmsAbvGrd', 'TotalBsmtSF','BsmtFinSF1','BsmtFinSF2','OpenPorchSF','EnclosedPorch','ScreenPorch','3SsnPorch','WoodDeckSF'
    ,'TotalBsmtSF','1stFlrSF','2ndFlrSF','GarageArea','Street','MiscVal','PoolArea'
]
train["HouseAge"] = train["YrSold"] - train["YearBuilt"]
train["RemodAge"] = train["YrSold"] - train["YearRemodAdd"]
train['TotalBathrooms'] = (train['FullBath'] +(0.5 * train['HalfBath']) +train['BsmtFullBath'] +(0.5 * train['BsmtHalfBath']))
train['TotalPorchSF'] = (train['OpenPorchSF'] + train['3SsnPorch'] + train['EnclosedPorch'] + train['ScreenPorch'] +train['WoodDeckSF'])
train['TotalRooms'] = train['TotRmsAbvGrd'] + train['TotalBsmtSF']
train['TotalBsmnt'] = train['BsmtFinSF1'] + train['BsmtFinSF2']
train['NEW_PorchArea'] = (train['OpenPorchSF'] + 
                            train['EnclosedPorch'] + 
                            train['ScreenPorch'] + 
                            train['3SsnPorch'] + 
                            train['WoodDeckSF'])
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']


test["HouseAge"] = test["YrSold"] - test["YearBuilt"]
test["RemodAge"] = test["YrSold"] - test["YearRemodAdd"]
test['TotalBathrooms'] = (test['FullBath'] +(0.5 * test['HalfBath']) +test['BsmtFullBath'] +(0.5 * test['BsmtHalfBath']))
test['TotalPorchSF'] = (test['OpenPorchSF'] + test['3SsnPorch'] + test['EnclosedPorch'] + test['ScreenPorch'] +test['WoodDeckSF'])
test['TotalRooms'] = test['TotRmsAbvGrd'] + test['TotalBsmtSF']
test['TotalBsmnt'] = test['BsmtFinSF1'] + test['BsmtFinSF2']
test['NEW_PorchArea'] = (test['OpenPorchSF'] + 
                            test['EnclosedPorch'] + 
                            test['ScreenPorch'] + 
                            test['3SsnPorch'] + 
                            test['WoodDeckSF'])
test['TotalSF'] = test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

train.drop(chk_list2, axis=1)
test.drop(chk_list2, axis=1)



Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,SaleType,SaleCondition,HouseAge,RemodAge,TotalBathrooms,TotalPorchSF,TotalRooms,TotalBsmnt,NEW_PorchArea,TotalSF
0,20,RH,80.0,11622,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,WD,Normal,49,49,1.0,260,887.0,612.0,260,1778.0
1,20,RL,81.0,14267,IR1,Lvl,AllPub,Corner,Gtl,NAmes,...,WD,Normal,52,52,1.5,429,1335.0,923.0,429,2658.0
2,60,RL,74.0,13830,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,WD,Normal,13,12,2.5,246,934.0,791.0,246,2557.0
3,60,RL,78.0,9978,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,...,WD,Normal,12,12,2.5,396,933.0,602.0,396,2530.0
4,120,RL,43.0,5005,IR1,HLS,AllPub,Inside,Gtl,StoneBr,...,WD,Normal,18,18,2.0,226,1285.0,263.0,226,2560.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,WD,Normal,36,36,1.5,0,551.0,0.0,0,1638.0
1455,160,RM,21.0,1894,Reg,Lvl,AllPub,Inside,Gtl,MeadowV,...,WD,Abnorml,36,36,1.5,24,552.0,252.0,24,1638.0
1456,20,RL,160.0,20000,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,WD,Abnorml,46,10,2.0,474,1231.0,1224.0,474,2448.0
1457,85,RL,62.0,10441,Reg,Lvl,AllPub,Inside,Gtl,Mitchel,...,WD,Normal,14,14,1.5,112,918.0,337.0,112,1882.0


## ─────────────────────────────────────────────────────────

### NULL 데이터 50% 이상이면 항목제거, 범주형 데이터처리, 결측값 정리, 공통된 데이터프레임만 남게 정리

In [5]:
threshold = 0.5
train_null_ratio = train.isnull().mean()
test_null_ratio = test.isnull().mean()

train = train.loc[:, train_null_ratio < threshold]
test = test.loc[:, test_null_ratio < threshold]

train = pd.get_dummies(train)
test = pd.get_dummies(test)

train.fillna(train.mean(), inplace=True)
test.fillna(test.mean(), inplace=True)

train, test = train.align(test, join='inner', axis=1)


## ─────────────────────────────────────────────────────────

### 학습데이터, 실험데이터 분리, 최적의 파라미터를 구해서 계산

In [7]:
X = train
y = y
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

param = {
    'max_depth': [2, 3, 4, 5, 6],
    'n_estimators': range(550, 700, 1000),
    'colsample_bytree': [0.5, 0.7, 1],
    'colsample_bylevel': [0.5, 0.7, 1],
    'learning_rate': [0.1, 0.05, 0.01]
}

model = XGBRegressor()
grid_search = GridSearchCV(estimator=model, param_grid=param, 
                           scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Estimator:", grid_search.best_estimator_)

Best Parameters: {'colsample_bylevel': 0.5, 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 550}
Best Estimator: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=0.5, colsample_bynode=None, colsample_bytree=0.7,
             device=None, early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.05, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=550,
             n_jobs=None, num_parallel_tree=None, random_state=None, ...)


## ─────────────────────────────────────────────────────────

### 최적의 모델로 계산한 점수 및 제출파일 생성

In [9]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'Validation RMSE: {rmse}')

test_predictions = best_model.predict(test)

submission = pd.DataFrame({
    'Id': pd.read_csv('test.csv')['Id'],
    'SalePrice': test_predictions
})

submission.to_csv('submission_best.csv', index=False)

Validation RMSE: 23653.792375060842


## ─────────────────────────────────────────────────────────