In [1]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Load the raw listings table from ../data, relative to the working directory.
# low_memory=False asks pandas not to parse the file in chunks, which avoids
# columns ending up with mixed inferred dtypes.
listings_csv = Path('..') / 'data' / 'listings.csv'
df = pd.read_csv(listings_csv, low_memory=False)

def clean_price(s):
    """Convert a Series of price strings such as ``'$1,250.00'`` to floats.

    Dollar signs and thousands separators are stripped before parsing.
    Missing entries (``NaN``, ``None``, ``pd.NA``) come back as ``NaN``.

    Parameters
    ----------
    s : pandas.Series
        Raw prices; either strings or already-numeric values.

    Returns
    -------
    pandas.Series
        Float values carrying the same index as ``s``.

    Raises
    ------
    ValueError
        Typically raised by the float conversion when a non-missing entry
        is still not a valid number after the symbols are removed.
    """
    # Record missing entries *before* stringifying: astype(str) can turn
    # None / pd.NA into the literal text 'None' / '<NA>', which the float
    # conversion rejects. Only float NaN happened to survive that round trip.
    missing = s.isna().to_numpy()
    text = s.astype(str).str.replace(r'[\$,]', '', regex=True)
    # A trailing '.00' needs no special treatment: float parsing accepts it.
    return text.mask(missing).astype(float)
# Parse the raw price column into floats so it can serve as the target.
df['price'] = clean_price(df['price'])

target = 'price'
# Candidate feature columns; any that this CSV does not contain are skipped
# (the printed 'Use cols' list shows what was actually kept).
use_cols = [c for c in [
    'room_type',
    'neighbourhood_cleansed',
    'latitude','longitude',
    'minimum_nights',
    'number_of_reviews','reviews_per_month',
    'availability_365',
    'calculated_host_listings_count'
] if c in df.columns]

# A row without a price has no label to learn from or to score against, so
# it is excluded here, before any splitting, rather than leaking NaN targets
# into the train/validation/test sets.
has_target = df[target].notna()
n_missing = int((~has_target).sum())
if n_missing:
    print('Dropped rows with missing target:', n_missing)

X = df.loc[has_target, use_cols].copy()
y = df.loc[has_target, target].copy()

# Column groups drive which preprocessing branch each feature goes through.
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()
print('Use cols:', use_cols)
print('Num:', len(num_cols), 'Cat:', len(cat_cols))

# 60/20/20 split: hold out 40% of the rows first, then cut that hold-out in
# half to obtain validation and test sets. The fixed seed keeps the partition
# reproducible between runs.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.40,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    random_state=42,
)
print('Before:', X.shape)
print('Train/Val/Test:', X_train.shape, X_val.shape, X_test.shape)

# Numeric features: fill gaps with the column median, then standardise.
num_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode into a dense array. Levels not seen during fit are ignored at
# transform time instead of raising.
cat_pipe = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own branch.
pre = ColumnTransformer(transformers=[
    ('num', num_pipe, num_cols),
    ('cat', cat_pipe, cat_cols),
])

pipe = Pipeline(steps=[('pre', pre)])

# Fit on the training rows only; validation and test are transformed with
# the statistics learned from train.
X_train_t = pipe.fit_transform(X_train)
X_val_t = pipe.transform(X_val)
X_test_t = pipe.transform(X_test)

print('After (features):', X_train_t.shape, X_val_t.shape, X_test_t.shape)

# Report how many indicator columns each categorical feature expanded into.
ohe = pipe.named_steps['pre'].named_transformers_['cat'].named_steps['ohe']
print('One-Hot sizes:', [(col, len(levels)) for col, levels in zip(cat_cols, ohe.categories_)])


Use cols: ['room_type', 'neighbourhood_cleansed', 'latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365', 'calculated_host_listings_count']
Num: 7 Cat: 2
Before: (3585, 9)
Train/Val/Test: (2151, 9) (717, 9) (717, 9)
After (features): (2151, 35) (717, 35) (717, 35)
One-Hot sizes: [('room_type', 3), ('neighbourhood_cleansed', 25)]
