After reviewing other transformer-based tabular learners, it looks like we are missing out on a big advantage that comes from how the data is embedded.

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [3]:
# Load the housing dataset from CSV into a DataFrame.
df = pd.read_csv('../../datasets/housing/housing.csv')
# Show the column index so the available fields are visible in the output.
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

In [4]:
# Drop every row containing at least one missing value; `df` is rebound to
# the filtered frame, so the unfiltered data is no longer reachable by name.
df = df.dropna()

In [5]:
# Per-column check for remaining missing values (True would mean the column
# still contains at least one NaN).
df.isna().any()

longitude             False
latitude              False
housing_median_age    False
total_rooms           False
total_bedrooms        False
population            False
households            False
median_income         False
median_house_value    False
ocean_proximity       False
dtype: bool

`ocean_proximity` is a categorical column, so it needs to be encoded as integer labels before it can be used as a model input.

In [6]:
# Replace the ocean_proximity categories with integer codes in place.
# The fitted encoder stays available as `le`, so the codes can be mapped back
# to the original labels with le.inverse_transform.
# NOTE(review): scikit-learn documents LabelEncoder for target labels;
# OrdinalEncoder is the feature-oriented equivalent -- confirm this is intended.
le = LabelEncoder()
df['ocean_proximity'] = le.fit_transform(df['ocean_proximity'])

In [7]:
# Feature matrix: every column except the regression target.
X = df.drop(columns=['median_house_value'])
# Regression target: the median house value column on its own.
y = df['median_house_value']

In [8]:
def categorize_columns(dataframe, max_categorical_unique=10):
    """Split the columns of ``dataframe`` into categorical and continuous groups.

    A column is treated as categorical when any of the following holds:

    * its dtype is ``object``;
    * its dtype is a pandas ``category`` dtype;
    * it holds at most ``max_categorical_unique`` distinct values (a missing
      value counts as one distinct value for this test).

    Every other column is treated as continuous.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be classified.
    max_categorical_unique : int, default 10
        Largest number of distinct values a column may hold and still be
        considered categorical on cardinality alone.

    Returns
    -------
    tuple
        ``(categorical_columns, continuous_columns, total_unique_classes)``
        where the first two items are lists of column labels in their original
        order and the last is the sum of ``nunique()`` (missing values
        excluded) over all categorical columns.
    """
    categorical_columns = []
    continuous_columns = []

    for column in dataframe.columns:
        series = dataframe[column]
        # dtype-based detection catches high-cardinality label columns that the
        # cardinality threshold alone would misclassify as continuous.
        is_label_dtype = series.dtype == 'object' or isinstance(series.dtype, pd.CategoricalDtype)
        if is_label_dtype or len(series.unique()) <= max_categorical_unique:
            categorical_columns.append(column)
        else:
            continuous_columns.append(column)

    # Total number of distinct classes across all categorical columns.
    total_unique_classes = sum(dataframe[col].nunique() for col in categorical_columns)

    return categorical_columns, continuous_columns, total_unique_classes


# Classify the feature columns, then show each result on its own line.
cat_cols, cont_cols, total_unique = categorize_columns(X)
print(cat_cols, cont_cols, total_unique, sep='\n')

['ocean_proximity']
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
5


In [23]:
# Split the data 60/20/20. First carve off 40% as a temporary hold-out set ...
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# ... then halve the hold-out set into validation (20%) and test (20%).
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

stand = StandardScaler()

# Standardize only the continuous columns; everything else passes through untouched.
ct = ColumnTransformer(
    transformers=[
        ('stand', stand, cont_cols)  # ('name', transformer, columns)
    ],
    remainder='passthrough'  # Leave other columns unchanged
)

# Fit the scaler on the training split only, then apply the same fitted
# transform to every split so no validation/test statistics leak in.
ct.fit(X_train)
df_train_scaled = ct.transform(X_train)
df_val_scaled = ct.transform(X_val)
df_test_scaled = ct.transform(X_test)

# ColumnTransformer emits the transformed columns first (in `cont_cols` order)
# followed by the passthrough columns in their original order. Label the
# output with that order instead of X_train.columns, which only matches when
# every passthrough column already sits after all continuous columns.
scaled_columns = list(cont_cols) + [col for col in X_train.columns if col not in cont_cols]

df_train = pd.DataFrame(df_train_scaled, columns=scaled_columns)
df_val = pd.DataFrame(df_val_scaled, columns=scaled_columns)
df_test = pd.DataFrame(df_test_scaled, columns=scaled_columns)


# standardize target values
def standardize_series(s, mean=None, std_dev=None):
    """Standardize ``s`` to zero mean and unit standard deviation.

    Parameters
    ----------
    s : pandas.Series
        Values to standardize.
    mean : float, optional
        Mean to subtract. When omitted, ``s.mean()`` is used.
    std_dev : float, optional
        Standard deviation to divide by. When omitted, ``s.std()`` is used.
        Pass the statistics of another split (for example the training set)
        to put several splits on the same scale.

    Returns
    -------
    tuple
        ``(standardized, mean, std_dev)`` -- the transformed values together
        with the statistics that were actually applied, so the transform can
        be reversed later.
    """
    if mean is None:
        mean = s.mean()
    if std_dev is None:
        std_dev = s.std()
    return ((s - mean) / std_dev), mean, std_dev


# Standardize the training target and keep its statistics for later reuse.
y_train, train_mean, train_std = standardize_series(y_train)

# Scale the validation and test targets with the TRAINING statistics so that
# all three splits share one scale (mirroring how the features are scaled) and
# predictions can be mapped back with train_mean / train_std alone.
y_val = (y_val - train_mean) / train_std
y_test = (y_test - train_mean) / train_std

# These names are kept for any cell that still refers to them; they now hold
# the statistics that were actually applied to each split.
val_mean, val_std = train_mean, train_std
test_mean, test_std = train_mean, train_std

# The scaled feature frames carry a fresh 0..n-1 index, so the targets need
# the same index before they are attached as a column.
train_target = y_train.reset_index(drop=True)
val_target = y_val.reset_index(drop=True)
test_target = y_test.reset_index(drop=True)

df_train['median_house_value'] = train_target
df_val['median_house_value'] = val_target
df_test['median_house_value'] = test_target

In [18]:
# Inspect the standardized training frame (scaled features plus the standardized target).
df_train

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,1.261189,-0.764261,-0.358478,0.584211,0.643412,0.431828,0.688976,-0.627034,1.0,-0.786025
1,-0.906340,1.384381,0.355019,0.401362,0.681233,0.077837,0.741252,-0.993331,1.0,-0.764461
2,-1.425748,1.000528,1.861291,-0.415056,-0.361193,-0.698186,-0.372218,-0.104586,3.0,2.523613
3,0.581963,-0.689363,0.196464,0.115662,0.258117,0.173441,0.362254,-0.108109,0.0,1.104704
4,0.936559,-0.736174,0.037909,-0.620761,-0.599934,-0.179689,-0.476769,-0.674510,1.0,-0.630765
...,...,...,...,...,...,...,...,...,...,...
12254,0.806707,-0.904695,-0.358478,0.163202,-0.086995,0.010657,-0.118681,0.814126,0.0,0.569055
12255,1.021462,-0.885971,-1.864750,1.468283,1.177626,1.579069,1.313670,0.481794,0.0,-0.402187
12256,0.581963,-0.768942,1.068516,-0.475396,-0.396649,-0.407070,-0.356535,-0.410684,0.0,1.103841
12257,-1.225976,0.897543,-1.309808,1.410229,1.246175,1.732379,1.460041,0.740152,3.0,0.205055


In [19]:
# Persist each split to its own CSV, leaving the positional index out of the files.
for split_frame, csv_path in (
    (df_train, '../housing/data/train.csv'),
    (df_val, '../housing/data/validation.csv'),
    (df_test, '../housing/data/test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [20]:
def reverse_standardize_series(s, original_mean, original_std_dev):
    """Undo a standardization: scale by the standard deviation, then add the mean.

    ``s`` may be a scalar or a pandas Series of standardized values;
    ``original_mean`` and ``original_std_dev`` are the statistics that were
    used when the values were standardized.
    """
    rescaled = s * original_std_dev
    return rescaled + original_mean


0


In [24]:
# Map one standardized target value back to the original median_house_value
# scale using the training-set statistics.
reverse_standardize_series(.90649, train_mean, train_std)

312520.2780866431