This assignment is inspired by: 

- https://www.kaggle.com/code/carlmcbrideellis/an-introduction-to-xgboost-regression
- https://www.kaggle.com/code/dansbecker/xgboost/notebook

In this assignment we will apply XGBoost regression techniques to predict house prices, using the well-known Kaggle dataset "House Prices - Advanced Regression Techniques": https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques


In [None]:
#=========================================================================
# load up the libraries
#=========================================================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import scipy.stats as stats
import numpy as np

# Lift pandas' display limits so every row and column is shown when a frame is printed

for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

#=========================================================================
# read in the data (the first CSV column is used as the row index)
#=========================================================================
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0) for csv_path in ('train.csv', 'test.csv')
)

# Feature Exploration, Selection and Model Building

In [None]:
## Explore the data, select features, then train and tune the model
#
# Steps performed by this cell:
#   1. hold out 20% of the labelled data for validation
#   2. drop sparsely populated columns
#   3. keep only features with a statistically detectable link to SalePrice
#   4. integer-encode the retained categorical features
#   5. fit a baseline XGBoost model and report its validation MSE
#   6. grid-search hyperparameters and refit on train + validate

# print(train_data.head())
# print(train_data.info())
# print(train_data.describe())

# Divide into train and validate (80 train and 20 validate)
train_df, validate_df = train_test_split(train_data, test_size=0.2, random_state=42)


# Drop columns with missing values > 10%.
# `dropna(thresh=...)` expects the minimum number of NON-missing values a
# column needs in order to be kept, so the threshold is "rows minus the
# number of gaps we tolerate" (not 10% of the rows).
max_missing = int(0.1 * len(train_df))
missing_threshold = len(train_df) - max_missing
train_df = train_df.dropna(thresh=missing_threshold, axis=1)
validate_df = validate_df[train_df.columns]


# Retain columns that are statistically significant
# (one-way ANOVA for categorical columns and correlation for numerical columns)

# One-way ANOVA for categorical columns: does the mean SalePrice differ
# between the levels of the category?
categorical_cols = train_df.select_dtypes(include='object').columns
significant_categorical_cols = []
for col in categorical_cols:
    # groupby leaves out rows whose category is missing.
    price_groups = [
        prices.to_numpy()
        for _, prices in train_df['SalePrice'].groupby(train_df[col])
    ]
    # ANOVA needs at least two levels to compare.
    if len(price_groups) < 2:
        continue
    f_stat, p_value = stats.f_oneway(*price_groups)
    # A NaN p-value (degenerate groups) compares False and is not selected.
    if p_value < 0.05:
        significant_categorical_cols.append(col)



# Correlation for numerical columns.
# SalePrice correlates perfectly with itself, so it is always retained here;
# the target is split off from the features further down.
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = train_df[numerical_cols].corr()
significant_numerical_cols = [col for col in numerical_cols if abs(corr_matrix['SalePrice'][col]) > 0.1]

# Keep the selected columns in their source order so that a test frame
# filtered from the raw CSV by column name lines up with the training features.
selected_cols = set(significant_categorical_cols) | set(significant_numerical_cols)
significant_cols = [col for col in train_df.columns if col in selected_cols]


# Copy so the encoding below writes into independent frames, not slices.
train_df = train_df[significant_cols].copy()
validate_df = validate_df[significant_cols].copy()

# XGBoost needs numeric input: replace each retained categorical column by
# integer codes. The levels are learned from the training split only; missing
# values and levels unseen during training are encoded as -1.
category_levels = {
    col: sorted(train_df[col].dropna().unique())
    for col in significant_categorical_cols
}
# test_data is encoded here too, so that it can be fed to the fitted model.
for frame in (train_df, validate_df, test_data):
    for col, levels in category_levels.items():
        # Skip columns that are absent or already numeric (e.g. cell re-run).
        if col in frame.columns and not pd.api.types.is_numeric_dtype(frame[col]):
            frame[col] = pd.Categorical(frame[col], categories=levels).codes

# # Separate features and target variable
X_train = train_df.drop('SalePrice', axis=1)
y_train = train_df[['SalePrice']]
X_validate = validate_df.drop('SalePrice', axis=1)
y_validate = validate_df[['SalePrice']]

# Train XGBoost model (baseline with default hyperparameters)
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

# Test the model on the validate set
y_pred = xgb_model.predict(X_validate)

# Evaluate model performance
mse = mean_squared_error(y_validate, y_pred)
print(f'Mean Squared Error on Validate Set: {mse}')

# Hyperparameter tuning using GridSearchCV (5-fold CV on the training split)
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

grid_search = GridSearchCV(XGBRegressor(), param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_

# Retrain the model on the entire train dataset (train + validate) with the best hyperparameters
best_xgb_model = XGBRegressor(**best_params)

X_combined = pd.concat([X_train, X_validate], ignore_index=True)
y_combined = pd.concat([y_train, y_validate], ignore_index=True)
best_xgb_model.fit(X_combined, y_combined)


# Predict on Test Data Set

In [None]:
# Score the unlabelled test set with the tuned model and save the predictions.

# Select exactly the features the model was trained on, in training order.
# Indexing by name raises a KeyError if a feature is missing from the test
# data instead of silently predicting on a different set of columns.
# The copy keeps later modifications from acting on a slice of the raw frame.
test_data = test_data[list(X_combined.columns)].copy()
test_data.info()

# Cast these columns to float64. `astype` returns a new object rather than
# modifying in place, so the result has to be assigned back to take effect.
float_cols = ['BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath']
test_data = test_data.astype(
    {col: 'float64' for col in float_cols if col in test_data.columns}
)

y_test = best_xgb_model.predict(test_data)
y_test = pd.DataFrame(y_test)
y_test.columns = ['Predicted_Sales_Price']
# NOTE(review): the file holds one prediction per test row, in row order, with
# no house Id. The Kaggle sample submission appears to use `Id,SalePrice`
# columns - confirm the expected format before uploading this file.
y_test.to_csv('submission.csv', index=False)