01 - Baseline Encoding Benchmark Code (Starting Code)

This notebook implements the baseline categorical-encoding benchmark for regression datasets.

1. The first step in setting up the environment is to import the necessary libraries.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn imports
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# category encoders import
import category_encoders as ce


Next, we load and prepare the data. The loading code is wrapped in a `try` block so that a missing dataset file is reported with a clear message instead of an unhandled exception.

In [26]:
def load_data(data_path="../data/raw/ames-housing-dataset/AmesHousing.csv"):
    """Load the Ames housing CSV and split it into features and target.

    Missing values in the feature columns are replaced with simple sentinels:
    the string "missing" for text (categorical) columns and -1 for numeric
    columns. The target column is returned exactly as read from the file, so
    a sentinel can never end up being used as a sale price.

    Parameters
    ----------
    data_path : str or path-like, optional
        Location of the CSV file. Defaults to the path this notebook has
        always used, so existing ``load_data()`` calls keep working.

    Returns
    -------
    tuple or None
        ``(X, y, categorical_columns, numerical_columns)`` where ``X`` is the
        feature DataFrame, ``y`` is the "SalePrice" Series, and the two lists
        hold the names of the text and numeric columns of ``X``. Both lists
        are derived from ``X`` itself, so every name they contain is a column
        of ``X``.
        ``None`` is returned (after printing a message) when the file does
        not exist; callers that unpack the result should check for it first.
    """
    try:
        # The first CSV column is used as the row index.
        raw_data = pd.read_csv(data_path, index_col=0)
    except FileNotFoundError:
        print("Dataset was not found!")
        return None

    target_column = "SalePrice"
    # Identifier columns and the target are kept out of the feature matrix.
    # "Order" is normally already consumed as the index, hence errors="ignore".
    non_feature_columns = ["Order", "PID", target_column]

    y = raw_data[target_column]
    X = raw_data.drop(columns=non_feature_columns, errors="ignore")

    # Derive the column lists from X (not from the raw frame) so they never
    # mention "PID" or "SalePrice", which are not available in X.
    # "string" is included alongside "object" so text columns are still found
    # when pandas reads them with a dedicated string dtype.
    categorical_columns = X.select_dtypes(include=["object", "string"]).columns.tolist()
    numerical_columns = X.select_dtypes(include="number").columns.tolist()

    # Per-column sentinels: "missing" for text features, -1 for numeric ones.
    fill_values = {column: "missing" for column in categorical_columns}
    fill_values.update({column: -1 for column in numerical_columns})
    X = X.fillna(fill_values)

    return X, y, categorical_columns, numerical_columns
# Quick check of the loader: as the last expression in the cell, the returned
# value is echoed as the cell output.
load_data()

(       MS SubClass MS Zoning  Lot Frontage  Lot Area Street    Alley  \
 Order                                                                  
 1               20        RL         141.0     31770   Pave  missing   
 2               20        RH          80.0     11622   Pave  missing   
 3               20        RL          81.0     14267   Pave  missing   
 4               20        RL          93.0     11160   Pave  missing   
 5               60        RL          74.0     13830   Pave  missing   
 ...            ...       ...           ...       ...    ...      ...   
 2926            80        RL          37.0      7937   Pave  missing   
 2927            20        RL          -1.0      8885   Pave  missing   
 2928            85        RL          62.0     10441   Pave  missing   
 2929            20        RL          77.0     10010   Pave  missing   
 2930            60        RL          74.0      9627   Pave  missing   
 
       Lot Shape Land Contour Utilities Lot Confi