In [4]:
!pip install xgboost



In [10]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

def fill_na(df):
    """Fill missing values in ``df`` in place and return it.

    Numeric columns are filled with their own median; object-dtype columns
    are filled with the literal string ``'None'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # 'number' selects every numeric width (int32, float32, ...). The former
    # ['int64', 'float64'] list silently left NaNs in any other numeric column.
    num_cols = df.select_dtypes(include='number').columns
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())

    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        # The string 'None' (not a real null) is the "missing" category label.
        df[col] = df[col].fillna('None')
    return df

def remove_outliers(df, max_living_area=4000, min_sale_price=300000):
    """Return a new frame without the large-but-cheap outlier rows.

    A row is removed when ``GrLivArea > max_living_area`` and
    ``SalePrice < min_sale_price``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``GrLivArea`` and ``SalePrice`` columns.
    max_living_area : number, default 4000
        Living-area threshold above which a row is a candidate outlier.
    min_sale_price : number, default 300000
        Price threshold below which a candidate is treated as an outlier.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` containing only the rows that are not outliers, with
        their original index labels.
    """
    is_outlier = (df['GrLivArea'] > max_living_area) & (df['SalePrice'] < min_sale_price)
    # Filter positionally with the mask instead of dropping by index label:
    # label-based drop also removes non-outlier rows that happen to share a
    # label with an outlier when the index is not unique.
    return df.loc[~is_outlier].copy()

def encode_ordinals(df):
    """Replace quality-grade strings with integers 0-5, in place.

    The grades ``'None' < 'Po' < 'Fa' < 'TA' < 'Gd' < 'Ex'`` map to 0..5.
    Missing values and unrecognised labels become 0. Listed columns that are
    absent from ``df`` are skipped.

    The function is safe to call more than once: a column that is already
    numeric is left as-is apart from filling missing values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    quality_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    quality_cols = [
        'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
        'GarageQual', 'GarageCond', 'PoolQC', 'BsmtQual', 'BsmtCond',
    ]
    for col in quality_cols:
        if col not in df.columns:
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already encoded (e.g. the cell was re-run). Mapping numbers
            # through the string map would turn every grade into 0.
            df[col] = df[col].fillna(0)
        else:
            # Cast so the result is int64 whether or not any value was unmapped
            # (map() alone yields float64 as soon as one NaN appears).
            df[col] = df[col].map(quality_map).fillna(0).astype('int64')
    return df

def gen_features(df):
    """Append four derived columns to ``df`` in place and return it.

    Columns added, in this order:
      * ``TotalSF``    = TotalBsmtSF + 1stFlrSF + 2ndFlrSF
      * ``TotalBath``  = FullBath + 0.5 * HalfBath + BsmtFullBath + 0.5 * BsmtHalfBath
      * ``HouseAge``   = YrSold - YearBuilt
      * ``RemodelAge`` = YrSold - YearRemodAdd

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every source column named above. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    df['TotalSF'] = df['TotalBsmtSF'].add(df['1stFlrSF']).add(df['2ndFlrSF'])

    # Half baths count for half a bathroom, both above grade and in the basement.
    above_grade_baths = df['FullBath'].add(df['HalfBath'].mul(0.5))
    df['TotalBath'] = above_grade_baths.add(df['BsmtFullBath']).add(df['BsmtHalfBath'].mul(0.5))

    # Both ages are measured back from the sale year.
    for age_col, start_col in (('HouseAge', 'YearBuilt'), ('RemodelAge', 'YearRemodAdd')):
        df[age_col] = df['YrSold'].sub(df[start_col])
    return df

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Directory holding train.csv / test.csv. Set the HOUSE_PRICES_DATA_DIR
# environment variable to run on another machine; the fallback is the path
# this notebook was originally run with.
DATA_DIR = Path(os.environ.get(
    "HOUSE_PRICES_DATA_DIR",
    r"C:\Users\bhagy\Downloads\house-prices-advanced-regression-techniques",
))
SEED = 42      # shared by the CV splitter and the booster
N_SPLITS = 5   # number of cross-validation folds

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
test_id = test['Id']  # kept aside for the submission file

# ---------------------------------------------------------------------------
# Target and raw feature frames
# ---------------------------------------------------------------------------
train = remove_outliers(train)
# The model is trained on log(1 + price); predictions are inverted with expm1.
y = np.log1p(train['SalePrice'])
# Non-inplace drops: each statement yields a new frame instead of mutating one
# that an earlier line (or an earlier cell) may still be looking at.
train = train.drop(columns=['SalePrice', 'Id'])
test = test.drop(columns=['Id'])

# ---------------------------------------------------------------------------
# Shared preprocessing on train + test stacked together, so both halves end
# up with the same dummy columns
# ---------------------------------------------------------------------------
df = pd.concat([train, test], ignore_index=True)
df = fill_na(df)
df = encode_ordinals(df)
df = gen_features(df)
df = pd.get_dummies(df, drop_first=True)

# ignore_index=True above makes the first n_train rows exactly the train rows.
n_train = y.shape[0]
X = df.iloc[:n_train]
X_test = df.iloc[n_train:]
assert len(X_test) == len(test_id), "test rows and test ids are out of sync"

# ---------------------------------------------------------------------------
# Model, cross-validation, final fit
# ---------------------------------------------------------------------------
model = Pipeline([
    ('scaler', StandardScaler()),
    ('reg', xgb.XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.05,
        max_depth=3,
        random_state=SEED,
    )),
])
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# The scorer returns negated RMSE; flip the sign. Because y is log1p(price),
# the reported RMSE is on the log scale.
scores = -cross_val_score(model, X, y, scoring='neg_root_mean_squared_error', cv=kf)
print("Average CV RMSE:", scores.mean())

model.fit(X, y)
preds = np.expm1(model.predict(X_test))  # back from log scale to prices

# ---------------------------------------------------------------------------
# Submission file
# ---------------------------------------------------------------------------
submission = pd.DataFrame({'Id': test_id, 'SalePrice': preds})
submission.to_csv('submission.csv', index=False)
print("Submission saved to 'submission.csv'")


Average CV RMSE: 0.1292382094615254
Submission saved to 'submission.csv'
