In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Train-Test Splits

Let's first inspect the data. We will use the Ames Housing Data.

There are 1379 rows in the data, with 80 columns.

There are many different features listed, like property size, number of rooms and neighborhood information. The target variable is `SalePrice`.

The distribution of datatypes is:
- **float** => 21
- **int** => 16
- **object** (categorical) => 43

In [None]:
# Load the Ames housing sales data and print a per-column summary
# (non-null counts and dtypes).
DATA_PATH = "./data/Ames_Housing_Sales.csv"
data = pd.read_csv(DATA_PATH)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1379 entries, 0 to 1378
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   1stFlrSF       1379 non-null   float64
 1   2ndFlrSF       1379 non-null   float64
 2   3SsnPorch      1379 non-null   float64
 3   Alley          82 non-null     object 
 4   BedroomAbvGr   1379 non-null   int64  
 5   BldgType       1379 non-null   object 
 6   BsmtCond       953 non-null    object 
 7   BsmtExposure   953 non-null    object 
 8   BsmtFinSF1     1379 non-null   float64
 9   BsmtFinSF2     1379 non-null   float64
 10  BsmtFinType1   953 non-null    object 
 11  BsmtFinType2   952 non-null    object 
 12  BsmtFullBath   1379 non-null   int64  
 13  BsmtHalfBath   1379 non-null   int64  
 14  BsmtQual       953 non-null    object 
 15  BsmtUnfSF      1379 non-null   float64
 16  CentralAir     1379 non-null   object 
 17  Condition1     1379 non-null   object 
 18  Conditio

In [None]:
# Tally how many columns there are of each dtype.
dtype_counts = data.dtypes.value_counts()
dtype_counts

object     43
float64    21
int64      16
Name: count, dtype: int64

### One-Hot Encoding

For the categorical columns we want to use one-hot encoding. This creates an additional column for each category, holding a 0/1 (False/True) indicator of whether that category applies to the data point. Depending on the number of categories present, these extra columns can increase the dimensionality of the feature space significantly. Luckily, we can easily check beforehand how many columns will be created by counting the number of categories.

In [None]:
# Names of all string-typed (object dtype) columns
is_categorical = data.dtypes == object
columns_categorical = data.columns[is_categorical]

# Number of extra columns each categorical variable contributes once one-hot encoded:
#   - count the distinct values per column, with NaN counted as its own category,
#   - keep only variables with more than one category (a variable with a single
#     category does not need to be encoded),
#   - subtract 1, since one category is taken as the baseline to avoid collinearity.
num_additional_columns = (
    data[columns_categorical]
    .nunique(dropna=False)
    .sort_values(ascending=False)
    .loc[lambda counts: counts > 1]
    .sub(1)
)

# Total number of new columns after applying one-hot encoding
total_new_columns = num_additional_columns.sum()
f"There will be {total_new_columns} additional columns"

'There will be 215 additional columns'

We can now perform one-hot encoding and build a new dataframe that contains the encoded columns in place of the original categorical columns.

In [None]:
# Work on a copy so the original frame is left untouched by this cell
data_ohc = data.copy()

# Columns that actually get encoded (categorical variables with more than one category)
columns_to_encode = num_additional_columns.index

# Dense output since we want to incorporate the encoded values directly into our DF.
# drop='first' skips the first category of every variable to avoid collinearity.
ohc = OneHotEncoder(sparse_output=False, drop='first')

# Encode all selected categorical columns at once
encoded = ohc.fit_transform(data_ohc[columns_to_encode])
encoded_df = pd.DataFrame(
    encoded,
    index=data_ohc.index,
    columns=ohc.get_feature_names_out(columns_to_encode)
)

# Sanity check: the encoder must produce exactly the number of columns predicted earlier
assert encoded_df.shape[1] == num_additional_columns.sum(), (
    "one-hot encoding produced an unexpected number of columns"
)

# Replace the string columns by their encoded counterparts. ALL categorical columns
# are dropped, not only the encoded ones: a single-category column that was skipped
# by the encoding would otherwise remain as raw strings, which a linear regression
# model cannot handle.
data_ohc = pd.concat(
    [data_ohc.drop(columns=columns_categorical), encoded_df],
    axis=1
)

In [None]:
# A linear regression model does not know how to handle string columns, so they are
# removed from the data that is not one-hot encoded as well. Every categorical column
# is dropped, including any single-category column that was skipped by the encoding.
# errors="ignore" keeps this cell safe to re-run once the columns are already gone.
# NOTE: this overwrites `data`, so `data_ohc` must have been built from it beforehand.
data = data.drop(columns=columns_categorical, errors="ignore")

### Modeling
#### Splitting the data into test/train

We now have two datasets, one with one-hot encoding and one without (its categorical columns were simply dropped). Let's compare how a model performs on each of them. To do so, we first need to split both datasets the same way, which is possible by passing the same random seed via the `random_state` parameter of the `train_test_split()` function. Then we can fit the same model to both datasets and evaluate its performance.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the features for both variants of the data.
target = 'SalePrice'

# Data without one-hot encoding
y_data = data[target]
X_data = data.drop(columns=target)

# One-hot encoded data
y_data_oh = data_ohc[target]
X_data_oh = data_ohc.drop(columns=target)

In [None]:
# Use identical settings (same seed, same test fraction) for both calls so that
# the two datasets are partitioned the same way.
split_kwargs = dict(random_state=42, test_size=0.3)

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, **split_kwargs)
X_train_oh, X_test_oh, y_train_oh, y_test_oh = train_test_split(X_data_oh, y_data_oh, **split_kwargs)

Let's make sure the data got split the same way by comparing the indices:

In [None]:
# Check BOTH the train and the test partition. Index.equals returns False on any
# mismatch (including different lengths) instead of raising like an element-wise
# `==` comparison would.
same_split = (
    X_train_oh.index.equals(X_train.index)
    and X_test_oh.index.equals(X_test.index)
)
f"The indices of the splits are the same: {same_split}"

'The indices of the splits are the same: True'

#### Fitting the model

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def _mse_summary(y_tr, pred_tr, y_te, pred_te, label):
    """Return a Series named `label` holding the train and test mean squared error."""
    scores = {
        'train': mean_squared_error(y_tr, pred_tr),
        'test': mean_squared_error(y_te, pred_te),
    }
    return pd.Series(scores, name=label)


lr = LinearRegression()

# Fit on the data without one-hot encoding
lr = lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)
scores_no_enc = _mse_summary(y_train, y_train_pred, y_test, y_test_pred, 'no enc')

# Refit the same estimator on the one-hot encoded data
lr = lr.fit(X_train_oh, y_train_oh)
y_train_pred_oh = lr.predict(X_train_oh)
y_test_pred_oh = lr.predict(X_test_oh)
scores_oh_enc = _mse_summary(y_train_oh, y_train_pred_oh, y_test_oh, y_test_pred_oh, 'oh enc')

# One column per dataset variant, one row per split
error_values = [scores_no_enc, scores_oh_enc]
err = pd.concat(error_values, axis=1)
err

Unnamed: 0,no enc,oh enc
train,1131507000.0,317726700.0
test,1372182000.0,4964292000.0
