After reviewing other transformer-based tabular learners, it appears that we are missing out on a significant advantage that comes from how the data is embedded.

In [54]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [55]:
# Load the housing dataset and list its column names.
df = pd.read_csv('../../datasets/housing/housing.csv')
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [56]:
# Drop every row that contains at least one missing value.
df = df.dropna()

In [57]:
# Sanity check: report, per column, whether any missing value remains.
df.isna().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

The `ocean_proximity` column is categorical, so it needs to be encoded as integer labels before it can be used as a model input.

In [58]:
# Replace the ocean_proximity values with integer class labels.
# `le` keeps the fitted encoder so the labels can be mapped back later.
le = LabelEncoder()
df['ocean_proximity'] = le.fit_transform(df['ocean_proximity'])

In [59]:
# Separate the feature matrix from the regression target.
X = df.drop(columns=['median_house_value'])
y = df['median_house_value']

In [60]:
def categorize_columns(dataframe, max_unique=10):
    """Split the columns of ``dataframe`` into categorical and continuous groups.

    A column is treated as categorical when its dtype is object, string or
    ``category``, or when it holds at most ``max_unique`` distinct values
    (a missing value counts as one distinct value for this check). Every
    other column is treated as continuous.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be classified.
    max_unique : int, default 10
        Largest number of distinct values a non-text column may have and
        still be considered categorical.

    Returns
    -------
    categorical_columns : list
        Names of the categorical columns, in frame order.
    continuous_columns : list
        Names of the continuous columns, in frame order.
    total_unique_classes : int
        Sum of ``nunique()`` over the categorical columns (missing values
        are not counted here).
    """
    categorical_columns = []
    continuous_columns = []

    for column in dataframe.columns:
        series = dataframe[column]
        dtype = series.dtype
        # Text-like and ``category`` columns are categorical regardless of
        # how many levels they have; a bare ``dtype == 'object'`` test would
        # miss ``category`` and dedicated string dtypes.
        is_label_dtype = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_label_dtype or series.nunique(dropna=False) <= max_unique:
            categorical_columns.append(column)
        else:
            continuous_columns.append(column)

    # Total number of distinct classes across all categorical columns.
    total_unique_classes = sum(dataframe[col].nunique() for col in categorical_columns)

    return categorical_columns, continuous_columns, total_unique_classes


# Classify the feature columns and print the two groups plus the class count.
cat_cols, cont_cols, total_unique = categorize_columns(X)
for summary in (cat_cols, cont_cols, total_unique):
    print(summary)

['ocean_proximity']
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
5


In [61]:
# Split into training (60%), validation (20%) and test (20%) sets:
# first hold out 40% of the rows, then halve that hold-out.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

stand = StandardScaler()

# Standardise only the continuous columns; every other column is passed
# through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('stand', stand, cont_cols)  # ('name', transformer, columns)
    ],
    remainder='passthrough'  # Leave other columns unchanged
)

# Fit once, on the training split only, so that no validation/test statistics
# leak into the scaler; then apply the fitted transform to the other splits.
df_train_scaled = ct.fit_transform(X_train)
df_val_scaled = ct.transform(X_val)
df_test_scaled = ct.transform(X_test)

# ColumnTransformer emits columns in transformer order (the scaled columns
# first, then the passthrough remainder), not in the input frame's order.
# Label the arrays with that order so values can never end up under the
# wrong column name, then restore the original column order.
scaled_column_order = list(cont_cols) + [col for col in X_train.columns if col not in cont_cols]


def _to_labelled_frame(scaled_values, target):
    """Wrap a transformed feature array in a DataFrame and append the target.

    ``scaled_values`` is an array produced by ``ct``; ``target`` is the
    matching target series, already re-indexed from 0 so that it aligns
    row-for-row with the new frame.
    """
    frame = pd.DataFrame(scaled_values, columns=scaled_column_order)
    frame = frame.reindex(columns=list(X_train.columns))
    frame['median_house_value'] = target
    return frame


# The splits keep the shuffled original index; reset it so the targets line
# up with the freshly built (0..n-1 indexed) feature frames.
train_target = y_train.reset_index(drop=True)
val_target = y_val.reset_index(drop=True)
test_target = y_test.reset_index(drop=True)

df_train = _to_labelled_frame(df_train_scaled, train_target)
df_val = _to_labelled_frame(df_val_scaled, val_target)
df_test = _to_labelled_frame(df_test_scaled, test_target)

In [62]:
# Display the training features as they were before scaling.
X_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
13778,-117.06,34.02,24.0,3912.0,809.0,1926.0,762.0,2.6875,1
12764,-121.40,38.61,33.0,3512.0,825.0,1515.0,782.0,1.9908,1
15712,-122.44,37.79,52.0,1726.0,384.0,614.0,356.0,3.6812,3
3725,-118.42,34.18,31.0,2887.0,646.0,1626.0,637.0,3.6745,0
13189,-117.71,34.08,29.0,1276.0,283.0,1216.0,316.0,2.5972,1
...,...,...,...,...,...,...,...,...,...
11397,-117.97,33.72,24.0,2991.0,500.0,1437.0,453.0,5.4286,0
12081,-117.54,33.76,5.0,5846.0,1035.0,3258.0,1001.0,4.7965,0
5447,-118.42,34.01,42.0,1594.0,369.0,952.0,362.0,3.0990,0
866,-122.04,37.57,12.0,5719.0,1064.0,3436.0,1057.0,5.2879,3


In [63]:
# Write each prepared split to its own CSV file, without the index column.
for frame, csv_path in (
    (df_train, '../housing/data/train.csv'),
    (df_val, '../housing/data/validation.csv'),
    (df_test, '../housing/data/test.csv'),
):
    frame.to_csv(csv_path, index=False)