Predict house prices with the lowest root mean squared error (improved version after the Kaggle intermediate course)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBRFRegressor

In [None]:
# Upload the data archive through Colab's file-upload helper.
# NOTE(review): google.colab is presumably only importable inside Google Colab;
# confirm the runtime before running this cell elsewhere.
from google.colab import files
uploaded = files.upload()

Saving house-prices-advanced-regression-techniques.zip to house-prices-advanced-regression-techniques.zip


In [None]:
import zipfile

# Unpack the uploaded archive so the CSV files can be read from disk.
archive_name = 'house-prices-advanced-regression-techniques.zip'
extract_dir = 'house-prices-data'

with zipfile.ZipFile(archive_name, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [None]:
#load data

# Read the training and test tables from the extracted folder.
data_dir = 'house-prices-data'
train_df, test_df = (
    pd.read_csv(f'{data_dir}/{file_name}')
    for file_name in ('train.csv', 'test.csv')
)

In [None]:
#data info

# Inspect both tables: schema, first rows and summary statistics.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly: wrapping it in print() emitted a stray "None", and for the test
# table the blank separator was printed after the report instead of before it.
train_df.info()
print(train_df.head())
print(train_df.describe())

print()  # blank line between the train and test reports
test_df.info()
print(test_df.head())
print(test_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 91 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              1460 non-null   int64  
 1   MSSubClass      1460 non-null   int64  
 2   MSZoning        1460 non-null   object 
 3   LotFrontage     1201 non-null   float64
 4   LotArea         1460 non-null   int64  
 5   Street          1460 non-null   object 
 6   Alley           91 non-null     object 
 7   LotShape        1460 non-null   object 
 8   LandContour     1460 non-null   object 
 9   Utilities       1460 non-null   object 
 10  LotConfig       1460 non-null   object 
 11  LandSlope       1460 non-null   object 
 12  Neighborhood    1460 non-null   object 
 13  Condition1      1460 non-null   object 
 14  Condition2      1460 non-null   object 
 15  BldgType        1460 non-null   object 
 16  HouseStyle      1460 non-null   object 
 17  OverallQual     1460 non-null   i

In [None]:
#Feature Engineering

def add_engineered_features(df):
  """Return a copy of ``df`` with the derived house features appended.

  The same function is applied to the training and the test table so both
  always receive identical features in identical column order.

  Added columns:
    TotalSF        -- TotalBsmtSF + GrLivArea
    TotalPorchSF   -- sum of the deck and the four porch area columns
    HasPool        -- 1 if PoolArea > 0 else 0
    HasDeck        -- 1 if WoodDeckSF > 0 else 0
    AgeOfHouse     -- YrSold - YearBuilt
    AgeSinceRemod  -- YrSold - YearRemodAdd
    HasCentralAir  -- 1 if CentralAir == 'Y' else 0
    HasFireplace   -- 1 if Fireplaces > 0 else 0
    TotalBathrooms -- full baths plus half of the half baths (above ground and basement)
    GarageCarsCat  -- GarageCars capped at 3

  Args:
    df: DataFrame holding the raw columns listed above.

  Returns:
    A new DataFrame; ``df`` itself is not modified, so the cell can be re-run.
  """
  return df.assign(
      TotalSF=df['TotalBsmtSF'] + df['GrLivArea'],
      TotalPorchSF=(
          df['WoodDeckSF'] + df['OpenPorchSF'] + df['EnclosedPorch']
          + df['3SsnPorch'] + df['ScreenPorch']
      ),
      HasPool=(df['PoolArea'] > 0).astype(int),
      HasDeck=(df['WoodDeckSF'] > 0).astype(int),
      AgeOfHouse=df['YrSold'] - df['YearBuilt'],
      AgeSinceRemod=df['YrSold'] - df['YearRemodAdd'],
      HasCentralAir=(df['CentralAir'] == 'Y').astype(int),
      HasFireplace=(df['Fireplaces'] > 0).astype(int),
      TotalBathrooms=(
          df['FullBath'] + 0.5 * df['HalfBath']
          + df['BsmtFullBath'] + 0.5 * df['BsmtHalfBath']
      ),
      # clip() leaves a missing GarageCars missing. The previous
      # `x if x < 3 else 3` lambda turned NaN into 3 because `NaN < 3` is False.
      GarageCarsCat=df['GarageCars'].clip(upper=3),
  )


# Add the same features to both tables.
train_df = add_engineered_features(train_df)
test_df = add_engineered_features(test_df)

In [None]:
#Split

# Separate the target from the features. 'Id' is the row identifier (it is
# what the submission file is keyed on), so it is removed as well; otherwise
# the models would be fitted on it as if it were a feature.
# errors='ignore' keeps the cell working if 'Id' is already absent.
X = train_df.drop(columns='SalePrice').drop(columns='Id', errors='ignore')
y = train_df['SalePrice']

# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
X_train_full, X_valid_full, y_train_full, y_valid_full = train_test_split(
    X, y, train_size=0.8, random_state=42
)

In [None]:
#target encoding for high cardinality cols

# Object-typed columns with at least 10 distinct training values.
high_cardinality_cols = [
    cname
    for cname, col_dtype in X_train_full.dtypes.items()
    if col_dtype == "object" and X_train_full[cname].nunique() >= 10
]
print(high_cardinality_cols)

def target_encode_smooth(X_train_full, X_valid_full, test_df, y_train_full, cols, m = 100):
  """Replace categorical columns with a smoothed mean of the training target.

  For every category of each column in ``cols`` the encoding is

      (count * category_mean + m * global_mean) / (count + m)

  where ``count`` and ``category_mean`` come from the training rows only and
  ``global_mean`` is the mean of ``y_train_full``. Each encoded column is
  appended as ``<col>_TE`` and the original column is dropped.

  Args:
    X_train_full: training features; the statistics are computed from it.
    X_valid_full: validation features, encoded with the training statistics.
    test_df: test features, encoded with the training statistics.
    y_train_full: training target, aligned with ``X_train_full``.
    cols: names of the columns to encode.
    m: smoothing weight; larger values pull categories towards the global mean.

  Returns:
    Tuple ``(X_train, X_valid, test)`` of new DataFrames. The inputs are left
    untouched, so the call can be repeated and frames that are slices of
    another frame are never written to.
  """
  global_mean = y_train_full.mean()

  # Work on copies so the caller's frames are never modified.
  frames = [X_train_full.copy(), X_valid_full.copy(), test_df.copy()]

  for col in cols:
    stats = (
        pd.DataFrame({col : frames[0][col], 'target' : y_train_full})
        .groupby(col)['target']
        .agg(['mean', 'count'])
    )
    smoothed = (stats['count'] * stats['mean'] + m * global_mean) / (stats['count'] + m)

    # Categories without training statistics (unseen or missing values) fall
    # back to the global mean in every frame. This includes the training frame,
    # where a missing category previously produced a NaN encoding.
    frames = [
        frame.drop(columns = col).assign(
            **{col + "_TE": frame[col].map(smoothed).fillna(global_mean)}
        )
        for frame in frames
    ]

  return tuple(frames)

# Encode the high-cardinality columns in all three frames (smoothing weight 100).
X_train_full, X_valid_full, test_df = target_encode_smooth(
    X_train_full,
    X_valid_full,
    test_df,
    y_train_full,
    high_cardinality_cols,
    100,
)

['Neighborhood', 'Exterior1st', 'Exterior2nd']


In [None]:
#clean data

def impute_frames(imputer, columns, train, valid, test):
  """Fit ``imputer`` on the training columns and apply it to all three frames.

  Args:
    imputer: an unfitted imputer exposing ``fit`` and ``transform``.
    columns: list of column names to select from every frame.
    train, valid, test: DataFrames that all contain ``columns``.

  Returns:
    Tuple of three DataFrames (train, valid, test) holding the imputed
    values, each keeping the column names and the index of its source frame.
  """
  imputer.fit(train[columns])
  return tuple(
      pd.DataFrame(imputer.transform(frame[columns]), columns=columns, index=frame.index)
      for frame in (train, valid, test)
  )


# Separate into numerical and categorical columns.
# select_dtypes('number') accepts every numeric width. The previous check
# against the literal names 'int64'/'float64' silently dropped columns stored
# with any other numeric dtype (e.g. int32).
numerical_cols = X_train_full.select_dtypes(include='number').columns.tolist()
categorical_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == 'object']

# Clean numerical data: fill gaps with the training median.
imputer_num = SimpleImputer(strategy = 'median')
X_train_num, X_valid_num, X_test_num = impute_frames(
    imputer_num, numerical_cols, X_train_full, X_valid_full, test_df
)

# Clean categorical data: fill gaps with the most frequent training value.
# NOTE(review): for columns where a missing value may mean "feature absent",
# a constant fill might be more appropriate; confirm against the data description.
imputer_cat = SimpleImputer(strategy= 'most_frequent')
X_train_cat, X_valid_cat, X_test_cat = impute_frames(
    imputer_cat, categorical_cols, X_train_full, X_valid_full, test_df
)

In [None]:
#encode categorical data

# Drop any categorical column that is still high-cardinality (object dtype
# with 10 or more distinct values in X_train_full).
high_cardinality_cols = [
    cname
    for cname in X_train_cat.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() >= 10
]

X_train_cat, X_valid_cat, X_test_cat = (
    frame.drop(columns=high_cardinality_cols)
    for frame in (X_train_cat, X_valid_cat, X_test_cat)
)

# One-hot encode the remaining low-cardinality columns.
low_cardinality_cols = [
    cname
    for cname in X_train_cat.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]

# Categories unseen during fitting are encoded as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoder.fit(X_train_cat)
encoded_names = encoder.get_feature_names_out(X_train_cat.columns)

X_train_cat_encoded, X_valid_cat_encoded, X_test_cat_encoded = (
    pd.DataFrame(encoder.transform(frame), index=frame.index, columns=encoded_names)
    for frame in (X_train_cat, X_valid_cat, X_test_cat)
)


In [None]:
# combine numerical and categorical data

# Place the imputed numerical block and the one-hot block side by side for each split.
X_train_final, X_valid_final, X_test_final = (
    pd.concat(parts, axis=1)
    for parts in (
        [X_train_num, X_train_cat_encoded],
        [X_valid_num, X_valid_cat_encoded],
        [X_test_num, X_test_cat_encoded],
    )
)

In [None]:
#scaling

# Fit the scaler on the training split only, then apply it to every split.
scaler = RobustScaler().fit(X_train_final)
X_train_scaled, X_valid_scaled, X_test_scaled = (
    scaler.transform(frame)
    for frame in (X_train_final, X_valid_final, X_test_final)
)

In [None]:
#modelling

# Candidate regressors keyed by display name.
# The XGBRFRegressor call used to pass a misspelled `learning_state = 0.1`.
# xgboost ignored it and warned `Parameters: { "learning_state" } are not used.`,
# so the value never took effect. The argument is removed, which leaves the
# fitted model unchanged.
# NOTE(review): if a boosted model with learning_rate=0.1 was intended,
# XGBRegressor may be the estimator to use; confirm against the xgboost docs.
models = {
    "Linear Regression" : LinearRegression(),
    "Random Forest" : RandomForestRegressor(random_state = 0),
    "XGBoost" : XGBRFRegressor(n_estimators = 100, random_state = 0)
}

# Function to measure the quality of each approach.
def score_dataset(model, X_train, X_valid, y_train, y_valid):
  """Fit ``model`` on the training data and return its validation RMSE.

  Args:
    model: estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train: training features and target.
    X_valid, y_valid: validation features and target.

  Returns:
    Root mean squared error of the predictions on ``X_valid``.
  """
  model.fit(X_train, y_train)
  preds = model.predict(X_valid)
  return root_mean_squared_error(y_valid, preds)

# The loop variable is named `estimator` so the dict is not overwritten while
# iterating (the old `for name, model in model.items()` rebound `model`).
for name, estimator in models.items():
  rmse = score_dataset(estimator, X_train_scaled, X_valid_scaled, y_train_full, y_valid_full)
  print(f"{name} RMSE : {rmse : .4f}")

Linear Regression RMSE :  66065.1791
Random Forest RMSE :  29683.6007


Parameters: { "learning_state" } are not used.



XGBoost RMSE :  31956.9277


In [None]:
#Test

# Refit a fresh random forest on the scaled training split and predict the test set.
test_model = RandomForestRegressor(random_state = 0).fit(X_train_scaled, y_train_full)
test_pred = test_model.predict(X_test_scaled)

# Two-column submission table: the test 'Id' next to the predicted 'SalePrice'.
submission = test_df[['Id']].assign(SalePrice=test_pred)

submission.to_csv('submission.csv', index=False)