In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# All necessary imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Scikit learn
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


# Load training data

In [2]:
# Load full training data as a dataframe
input_train_file = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
fulltrain_df = pd.read_csv(input_train_file)
print(f'Full training data shape: {fulltrain_df.shape}') 

Full training data shape: (1460, 81)


# Preprocess data

## Preprocessing functions


In [3]:
def clean_data(df):
    """
    This function removes columns from training that have >80% NaN values.
    You can play with the percentage of NaN values to see what works best.
    """
    samples = df.shape[0] # number of samples in the data
    cols_to_be_dropped = [] # columns with majority null values
    
    # Remove columns from training that have >80% missing columns
    for c in df.columns.to_list():
        num_nulls = df[c].isnull().sum() # calculate the sum of null values
        percent_nulls = num_nulls/samples
        if percent_nulls > 0.8:
            cols_to_be_dropped.append(c)
    print(f'Columns that have more than 80% NaN values:{cols_to_be_dropped}')
    df = df.drop(cols_to_be_dropped, axis=1)
    return df
    
def handle_skewness(skew, df):
    """
    This function handles skewness in data and performs log transformation on features only.
    ** Not being used in preprocessing as it didn't help in accuracy
    """
    # Log transform for features with skewness > 2
    for col in df.columns.to_list():
        if skew[col] > 2:  # For positively skewed
            df[col] = np.log1p(df[col])  # log1p handles 0 and positive values (log(1+x))
        elif skew[col] < -2:  # For negatively skewed
            min_val = abs(df[col].min()) + 1  # shift the column to make all values positive
            df[col] = np.log1p(df[col] + min_val)  # log1p after shifting 
    return df

def visualize_and_remove_outliers(df, Y):
    """
    This function plots the scatterplots for every feature against the label.
    I used it to visually filter out outliers from some of the features.
    ** Might be buggy but works as a weak checker
    """
    numerical_cols = df.select_dtypes(exclude=['object']).columns
    n_cols = 6
    n_features = len(numerical_cols)  # total number of features
    n_rows = (n_features // n_cols)  # compute number of rows (ceiling division)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(25,20), constrained_layout=True)
    axes = axes.flatten() # flatten to 1D
    print(n_rows, n_cols, n_features)
    
    for i, col in enumerate(numerical_cols):
        ax = axes[i]
        clean_data = pd.concat([df[col], Y], axis=1).dropna()
        ax.scatter(clean_data[col], clean_data[Y.name], alpha=0.7, edgecolor='k')
        ax.set_title(f"{col} vs Y", fontsize=10)
        ax.set_xlabel(col, fontsize=8)
        ax.set_ylabel("Y", fontsize=8)
        ax.tick_params(axis='both', labelsize=8)

    plt.suptitle("Scatterplots of Features vs Target", fontsize=14)
    plt.show()

def remove_outliers(df):
    """
    This function removes the visually inspected outliers.
    The list of the outliers is given in the cell below.
    """
    conditions = ( 
        (df['BsmtFinSF2'] <= 1200) &  
        (df['BsmtFinSF1'] <= 3000) &  
        (df['MasVnrArea'] <= 1200) &  
        (df['EnclosedPorch'] <= 400) &
        (df['GrLivArea']<=4000)
    )

    filtered_df = df[conditions].reset_index(drop=True) 
    return filtered_df
    


In [4]:
# On visual inspection here are the outliers:
'''
Only numerical features:
1. BsmtFinSF2 > 1200
2. BsmtFinSF1 > 3000
3. MasVnrArea > 1200
4. EnclosedPorch > 400
5. GrLivArea > 4000
'''   

def preprocess(fulltrain_df, training):
    """
    This function pre-processes the data by encoding categoricals and imputing NaN values using target encoding
    """
    # visualize_outliers(fulltrain_df) # visulaize outliers for each feature - helper function: not to be used in general training
    # visualize_and_remove_outliers(fulltrain_df, Y)

    # Remove outliers
    if training:
        fulltrain_df = remove_outliers(fulltrain_df)
    
    # Drop columns with >80% null values
    fulltrain_df = clean_data(fulltrain_df)

    # fulltrain_df.info() # info of all feature columns
    categoricals = fulltrain_df.select_dtypes(include='object') # object = categoricals
    categorical_column_names = categoricals.columns # list of categoricals
    # print(categoricals.nunique()) # unique categories for each category
    columns_with_nan = fulltrain_df.columns[fulltrain_df.isna().any()].tolist() # NaN columns to be imputed

    # Encode categoricals
    le = LabelEncoder()
    for col in categorical_column_names.to_list():
        fulltrain_df[col] = le.fit_transform(fulltrain_df[col]) # inplace encode categorical columns
    
    # print(fulltrain_df[categorical_column_names.to_list()].head(1)) # simple check to see encodings
    # if above statement shows an empty dataframe, it's possible the script was run before and cateforicals have already been encoded
    
    # Impute missing values using mean of all other entries (target encoding) - here the categoricals are encoded so the mean approach works
    for column in columns_with_nan:
        mean_value = fulltrain_df[column].mean()  
        fulltrain_df[column].fillna(mean_value, inplace=True)  

    # Check for highly skewed features and transform them if skewness is > abs(2) - not used in training because it didn't help in accuracy
    # skew = fulltrain_df.skew()
    # print(fulltrain_df['SalePrice'])
    # fulltrain_df = handle_skewness(skew, fulltrain_df)
        
    return fulltrain_df



## Apply log transformation and split training/testing data

In [5]:
# Preprocess data
fulltrain_df = preprocess(fulltrain_df, training = True)

# Split labels and features
Y = fulltrain_df['SalePrice']
fulltrain_df.drop(['SalePrice'], inplace=True, axis=1)
X_ids = fulltrain_df['Id'] # save IDs to put in later to match columns of testing data
X = fulltrain_df.drop(columns=['Id'],axis=1) # we don't need Id column for training

Y = np.log1p(Y) # log transformation to convert rightly skewed labels to normal distributions - helps in accuracy
# Can use this to visualize the rightly skewed distribution of Y emphasizing the need for log transormation
# plt.hist(Y, bins=30, edgecolor='k', alpha=0.7)
# plt.title("Histogram of Y")
# plt.xlabel("Y values")
# plt.ylabel("Frequency")
# plt.show()

# Split training/testing for training data before submission
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)
print(f'Training data shape: {X_train.shape}')
print(f'Testing data shape: {X_test.shape}')


Columns that have more than 80% NaN values:['Alley', 'PoolQC', 'Fence', 'MiscFeature']
Training data shape: (1155, 75)
Testing data shape: (289, 75)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fulltrain_df[column].fillna(mean_value, inplace=True)


# Training Random Forest model

## Without tuned hyperparameters on train/test split
### just used as a check. The next cell trains the model on the entire training dataset

In [6]:
# Training
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Prediction
y_pred = rf_model.predict(X_test)
# We don't need to "untransform" our predictions here because Y was divided into training and testing sets, so the predictions will be log transformed and so will the testing labels
print(f'r2 test score (before tuning hyperparameters): {r2_score(y_test,y_pred)}') 

r2 test score (before tuning hyperparameters): 0.9007953416565198


## With tuned hyperparameters on full train data

In [7]:
# Randomized search for hyperparameters
rf_model = RandomForestRegressor() # redefine model object
random_grid = {'bootstrap': [True, False], # domain to search 
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
               'max_features': ['auto', 'sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [130, 180, 230]}

# Search across (n_iters * cv)  different combinations, and use all available cores
rf_model = RandomizedSearchCV(estimator = rf_model, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_model.fit(X, Y)
print(f'Best parameters:')
print(rf_model.best_params_)

# Will eventually implement Bayesian Method 

Fitting 3 folds for each of 20 candidates, totalling 60 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters:
{'n_estimators': 230, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'bootstrap': False}


In [8]:
# Simple check on train/test split to see advantage of hyperparameter tuning
rf_check = RandomForestRegressor(**rf_model.best_params_)
rf_check.fit(X_train, y_train) # remember Y is log transformed i.e Y = log(Y+1)
y_pred = rf_check.predict(X_test)
print(f'r2 test score (after hyperparameter tuning): {r2_score(y_test, y_pred)}') 

r2 test score (after hyperparameter tuning): 0.9062164295873577


# Predict for test data and prepare submission file under /kaggle/working/

In [9]:
# Submission data    
submission_name = 'submission_13' # change as per submission number
input_test_file = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"

# Read and preprocess testing data
submission_df = pd.read_csv(input_test_file)
X['Id'] = X_ids # add already saved Ids to simply match all columns between training and testing data - hacky
submission_df = submission_df[X.columns] # there are some columns that are removed as part of pre-processing in training and maintaining consistency between training and testing sets is needed otherwise an error will be thrown
submission_df = preprocess(submission_df, training = False)


# Predict using trained model
ids = submission_df.pop('Id') # save IDs
y_pred_submission = rf_model.predict(submission_df)

# Prepare csv submission
output = pd.DataFrame({'Id': ids,
                       'SalePrice': np.expm1(y_pred_submission)}) # converting predicted labels back from log transformation
output.to_csv(f'/kaggle/working/{submission_name}.csv', index=False) 
print(f"Check /kaggle/working directory for 'submission' {submission_name}")

Columns that have more than 80% NaN values:[]
Check /kaggle/working directory for 'submission' submission_13


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fulltrain_df[column].fillna(mean_value, inplace=True)
