In [7]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, StackingRegressor, VotingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, LassoLarsIC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
import joblib
import warnings
warnings.filterwarnings("ignore")
from data_utils import load_data, log_target, inv_log_target
from features import FillMissingTransformer, NumericSelector, CatagoricalSelector

In [6]:
# Read both CSV splits from the shared data directory in one pass:
# the first path becomes `train`, the second becomes `test`.
train, test = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

In [8]:
# 2. Target transformation
# Attach `price_log`, the output of the project-local `log_target` helper
# applied to the raw `price` column (presumably a log transform — confirm in data_utils).
train = train.assign(price_log=log_target(train["price"]))

In [19]:
# Show the column index of `train`, which now also contains the added `price_log` column.
train.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus', 'price_log'],
      dtype='object')

#### 3. Feature selection: all numeric columns plus all categorical (object-dtype) columns
#### You can expand these lists with domain-specific features

In [29]:
# Partition the column names of `train` by dtype:
# numeric dtypes go to `numeric_feats`, object dtypes to `categorical_feats`.
numeric_feats = list(train.select_dtypes(include=[np.number]).columns)
categorical_feats = list(train.select_dtypes(include=['object']).columns)

In [30]:
# Exclude the target columns so they are never fed to the model as predictors.
# The list is rebuilt with a filter rather than mutated with `.remove(...)`:
# `.remove` raises ValueError when the name is already gone, which made this
# cell fail on a second run. Filtering is safe to re-run any number of times.
numeric_feats = [col for col in numeric_feats if col not in ('price', 'price_log')]

### Custom Pipeline

In [31]:
# Custom numeric branch, applied in order:
#   1. NumericSelector       - project-local; constructed with the numeric feature names
#   2. FillMissingTransformer - project-local; presumably imputes missing values (confirm in features.py)
#   3. StandardScaler        - zero-mean / unit-variance scaling
numeric_pipeline = Pipeline([
    ('num_selector', NumericSelector(numeric_feats)),
    ('fill_missing', FillMissingTransformer()),
    ('scaler', StandardScaler()),
])

In [32]:
# Custom categorical branch, applied in order:
#   1. CatagoricalSelector    - project-local (name spelled as exported by `features`)
#   2. FillMissingTransformer - project-local; presumably imputes missing values (confirm in features.py)
#   3. OneHotEncoder          - categories unseen during fit are ignored instead of raising
categorical_pipeline = Pipeline([
    ('cat_selector', CatagoricalSelector(categorical_feats)),
    ('imputer', FillMissingTransformer()),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

### 5. ColumnTransformer to combine the numeric and categorical pipelines

In [33]:
# Send each feature group through its own custom pipeline and concatenate the
# results; any column not named in either group is dropped from the output.
preprocessor = ColumnTransformer(
    [
        ('numeric', numeric_pipeline, numeric_feats),
        ('categorical', categorical_pipeline, categorical_feats),
    ],
    remainder='drop',
)

#### Pipeline using built-in scikit-learn transformers

In [36]:
# 4. Preprocessors built only from scikit-learn's stock transformers

# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with each column's most frequent
# value, then one-hot encode.
# NOTE: the former `fill_value="None"` argument was removed. SimpleImputer only
# consults `fill_value` when strategy="constant", so with "most_frequent" it had
# no effect and suggested behaviour that never happened. If missing values
# should instead become their own category, use
# SimpleImputer(strategy="constant", fill_value="None").
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Categories not seen during fit are ignored rather than raising an error.
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Combine both branches; columns outside the two feature lists are dropped.
preprocessor1 = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_feats),
    ("cat", categorical_transformer, categorical_feats)
], remainder="drop")