In [325]:
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns
import string
import time

from datetime import datetime
from scipy.stats import uniform, randint, zscore
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.metrics import ( 
    mean_absolute_error, 
    mean_absolute_percentage_error, 
    mean_squared_error,
    r2_score
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.pipeline import FeatureUnion, Pipeline 
from unidecode import unidecode
from xgboost.sklearn import XGBRegressor

# Data Collection

## Downloading data

In [326]:
# python download_data.py

## Combining data from all chunks into one file

In [327]:
# cat data/offers_data*.txt | grep -v '"error"' > data/offers_data.txt

# Data Cleaning & Preparation

In [328]:
pd.set_option('display.max_columns', None)

## Loading and reviewing data

In [329]:
df = pd.read_json('data/offers_data.txt', lines=True)

In [330]:
df['number_of_rooms'] = df.numberOfRooms
df['number_of_floors'] = df.numberOfFloors
df['building_type'] = df.type
df['windows_orientation'] = df.windowsOrientation
df = df.drop(columns=['numberOfRooms', 'numberOfFloors', 'type', 'windowsOrientation'])

## Imputing missing values

In [331]:
multiple_values_columns = ['equipment', 'areas', 'conveniences', 'security', 'media_types']
for column in multiple_values_columns:
    types = df[column].dropna().apply(lambda x: x.split(', ') if x else None).explode().unique()
    for t in types:
        df[t] = df[column].apply(lambda x: 1 if x is not None and t in x else 0)
df = df.drop(columns=multiple_values_columns)

In [332]:
df = df.drop(columns=['internet'])

In [333]:
df = df.drop(columns=['lift'])

In [334]:
df['separate_kitchen'] = df.kitchen.map(lambda k: 1 if k else 0)
df = df.drop(columns=['kitchen'])

In [335]:
building_type_grp = df.groupby(['building_type'])
building_type_grp['heating'].value_counts(normalize=True)

building_type  heating    
APARTMENT      URBAN          0.735046
               GAS            0.148202
               OTHER          0.071283
               BOILER_ROOM    0.028369
               ELECTRICAL     0.016969
               TILED_STOVE    0.000131
BLOCK          URBAN          0.806867
               GAS            0.114169
               OTHER          0.046790
               BOILER_ROOM    0.022180
               ELECTRICAL     0.008734
               TILED_STOVE    0.001261
HOUSE          GAS            0.540778
               OTHER          0.210790
               URBAN          0.114806
               BOILER_ROOM    0.064617
               ELECTRICAL     0.056462
               TILED_STOVE    0.012547
INFILL         URBAN          0.855263
               GAS            0.078947
               BOILER_ROOM    0.039474
               OTHER          0.013158
               ELECTRICAL     0.013158
LOFT           URBAN          0.808219
               GAS            0.16438

In [336]:
for building_type in df.building_type.dropna().unique():
    heating_mode = df[df['building_type'] == building_type]['heating'].mode()
    heating_mode = heating_mode[0] if heating_mode.any() else df['heating'].mode()[0]
    df.loc[df['building_type'] == building_type, 'heating'] = df['heating'].fillna(heating_mode)

In [337]:
df.windows.fillna(df.windows.mode()[0], inplace=True)
df.ownership.fillna(df.ownership.mode()[0], inplace=True)

## Cleaning data

In [338]:
unique_counts = df.nunique()
columns_with_one_unique_value = unique_counts[unique_counts == 1].index.tolist()
columns_with_one_unique_value

['propertype', 'offertype']

In [339]:
df = df.drop(columns=['parking', 'rooms', 'windows_orientation', 'free_from', 'rent', 'rent_currency', 'material', 'street', 'propertype', 'offertype'])

In [340]:
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [341]:
unique_counts = df.nunique()
columns_with_one_unique_value = unique_counts[unique_counts == 1].index.tolist()
columns_with_one_unique_value

['cable_television', 'electricity', 'sewage', 'water', 'gas']

In [342]:
df = df.drop(columns=columns_with_one_unique_value)

In [343]:
df = df.drop(columns=['subregion_id'])  # too many unique values, information conveyed in region_name

In [344]:
df = df.drop(columns=['user_type'])  # irrelevant

# Feature Engineering

## Adjusting existing features

### floor

In [345]:
df.floor.value_counts()

floor
FLOOR_1            10379
GROUND_FLOOR        8599
FLOOR_2             7859
FLOOR_3             6377
FLOOR_4             4116
FLOOR_5             1348
FLOOR_6              829
FLOOR_7              547
FLOOR_HIGHER_10      416
FLOOR_8              377
FLOOR_9              273
FLOOR_10             252
GARRET                72
CELLAR                29
Name: count, dtype: int64

In [346]:
def process_floor(floor):
    if floor == 'CELLAR':
        return -1
    elif floor == 'GROUND_FLOOR':
        return 0
    elif floor == 'FLOOR_HIGHER_10':
        return 11
    elif floor == 'GARRET':
        return 12
    else:
        return int(floor.split('_')[-1])

In [347]:
df.floor = df.floor.apply(process_floor)

### windows

In [348]:
df.windows.value_counts(normalize=True)

windows
PLASTIC      0.939864
WOODEN       0.048296
ALUMINIUM    0.011839
Name: proportion, dtype: float64

In [349]:
df.windows = df.windows.replace(['WOODEN', 'ALUMINIUM'], 'NON_PLASTIC')

### heating

In [350]:
df.heating.value_counts(normalize=True)

heating
URBAN          0.697466
GAS            0.201119
OTHER          0.056591
BOILER_ROOM    0.022931
ELECTRICAL     0.020061
TILED_STOVE    0.001833
Name: proportion, dtype: float64

In [351]:
df.heating = df.heating.replace(['TILED_STOVE', 'ELECTRICAL', 'BOILER_ROOM'], 'OTHER')

### condition

In [352]:
df.condition.value_counts(normalize=True)

condition
READY_TO_USE     0.508114
TO_COMPLETION    0.408844
TO_RENOVATION    0.083042
Name: proportion, dtype: float64

In [353]:
df.condition = df.condition.replace('TO_RENOVATION', 'TO_COMPLETION')

### ownership

In [354]:
df.ownership.value_counts(normalize=True)

ownership
FULL_OWNERSHIP       0.953849
LIMITED_OWNERSHIP    0.041834
SHARE                0.003231
USUFRUCT             0.001085
Name: proportion, dtype: float64

In [355]:
df.ownership = df.ownership.replace(['SHARE', 'USUFRUCT'], 'LIMITED_OWNERSHIP')

### price_currency

In [356]:
df.price_currency.unique()

array(['PLN', 'EUR'], dtype=object)

In [357]:
df.loc[df['price_currency'] == 'EUR', 'ad_price'] = df['ad_price'] * 4.35
df.loc[df['price_currency'] == 'USD', 'ad_price'] = df['ad_price'] * 4
df = df.drop(columns=['price_currency'])

### city_name

In [358]:
punctuation_chars = set(string.punctuation)
whitespace_chars = set(string.whitespace)
special_chars = punctuation_chars - {'-'} | whitespace_chars

df['city_name'] = df['city_name'].apply(lambda x: '_'.join(x.split('_')[:-1]) if x.split('_')[-1].isdigit() else x)

### building_type

In [359]:
df.building_type.value_counts(normalize=True)

building_type
BLOCK        0.546283
APARTMENT    0.279965
TENEMENT     0.115545
RIBBON       0.030839
HOUSE        0.024691
INFILL       0.001350
LOFT         0.001326
Name: proportion, dtype: float64

In [360]:
df.building_type = df.building_type.replace(['INFILL', 'LOFT', 'HOUSE', 'RIBBON'], 'OTHER')

## Adding new features

### building_age

In [361]:
df['building_age'] = df['year'].apply(lambda x: age if (age := datetime.now().year - x) >= 0 else 0)

### population, city_area, population_density

In [362]:
df_cities_data = pd.read_csv('data/cities_data.txt', sep=',')  # https://pl.wikipedia.org/wiki/Dane_statystyczne_o_miastach_w_Polsce

In [363]:
df_cities_data.head()

Unnamed: 0,city_name,subregion_id,region_name,city_area,population,population_density
0,Aleksandrów Kujawski,aleksandrowski,kujawsko-pomorskie,723,12058,1668
1,Aleksandrów Łódzki,zgierski,łódzkie,1382,21754,1574
2,Alwernia,chrzanowski,małopolskie,888,3336,376
3,Andrychów,wadowicki,małopolskie,1033,19837,1920
4,Annopol,kraśnicki,lubelskie,773,2436,315


In [364]:
df_cities_data['city_name'] = df_cities_data['city_name'].apply(lambda x: '-'.join(unidecode(x).lower().split()))

In [365]:
df_cities_data.head()

Unnamed: 0,city_name,subregion_id,region_name,city_area,population,population_density
0,aleksandrow-kujawski,aleksandrowski,kujawsko-pomorskie,723,12058,1668
1,aleksandrow-lodzki,zgierski,łódzkie,1382,21754,1574
2,alwernia,chrzanowski,małopolskie,888,3336,376
3,andrychow,wadowicki,małopolskie,1033,19837,1920
4,annopol,kraśnicki,lubelskie,773,2436,315


In [366]:
df[df.city_name.apply(lambda x: x not in df_cities_data.city_name.tolist())].__len__()

3041

In [367]:
cities_names = df_cities_data.city_name.to_list()
def create_feature(city_name, feature_name):
    if not city_name in cities_names:
        return None
    else:
        return df_cities_data[df_cities_data.city_name == city_name][feature_name].iloc[0]

In [368]:
df['population'] = df.city_name.apply(create_feature, args=('population',))

In [369]:
df['city_area'] = df.city_name.apply(create_feature, args=('city_area',))

In [370]:
df['population_density'] = df.city_name.apply(create_feature, args=('population_density',))

In [371]:
df.drop(columns=['city_name'], inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

In [372]:
df_cleaned = df

In [373]:
df_cleaned.__len__()

38432

In [374]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38432 entries, 0 to 38431
Data columns (total 44 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   floor               38432 non-null  int64  
 1   year                38432 non-null  float64
 2   windows             38432 non-null  object 
 3   heating             38432 non-null  object 
 4   condition           38432 non-null  object 
 5   ownership           38432 non-null  object 
 6   lat                 38432 non-null  float64
 7   long                38432 non-null  float64
 8   ad_price            38432 non-null  float64
 9   market              38432 non-null  object 
 10  region_name         38432 non-null  object 
 11  area                38432 non-null  float64
 12  number_of_rooms     38432 non-null  int64  
 13  number_of_floors    38432 non-null  float64
 14  building_type       38432 non-null  object 
 15  dishwasher          38432 non-null  int64  
 16  furn

# Data Preprocessing

In [313]:
class SelectColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_select: list):
        self.columns_to_select = columns_to_select
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        return X[self.columns_to_select]

In [261]:
class LogTransformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self    
    
    def transform(self, X, y=None):
        X = pd.DataFrame(X)
        for col in X:
            X[col] = X[col].apply(lambda x: 0 if x==0 else np.log(x))
        return X

In [306]:
class DropNaTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X)
        X = X.dropna().reset_index(drop=True)
        return X

In [263]:
class ConcatenateFeaturesTargetTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.target = None
    
    def fit(self, X, y=None):
        self.target = y
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X)
        X = X.rename(str, axis="columns")
        y = pd.DataFrame(self.target)
        return pd.concat([X, y], axis=1)

## Outliers

In [387]:
class RemoveOutliersTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, method='iqr', threshold=3, q1=0.25, q3=0.75):
        _allowed_methods = ['iqr', 'zscore']
        if method not in _allowed_methods:
            raise ValueError(f"Allowed methods: {_allowed_methods}, given {method}")
        self.method = method
        self.threshold = threshold
        self.q1 = q1
        self.q3 = q3
        self.outliers_columns = []
        
    def fit(self, X, y=None):
        self._get_outliers_columns(X)
        return self 
    
    def transform(self, X, y=None):
        if self.method == 'zscore':
            for column in self.outliers_columns:
                z_scores = np.abs(zscore(X[column]))
                outliers_mask = z_scores > self.threshold
                X.loc[outliers_mask, column] = None
        else:
            for column in self.outliers_columns:
                Q1 = X[column].quantile(self.q1)
                Q3 = X[column].quantile(self.q3)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers_mask = (X[column] < lower_bound) | (X[column] > upper_bound)
                X.loc[outliers_mask, column] = None
        return X

    def _get_outliers_columns(self, X):
        numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
        for column in numerical_columns:
            if all(value in [0, 1] for value in X[column].unique()):
                continue
            if self.method == 'zscore':
                outliers = X[abs(zscore(X[column])) > self.threshold]
            else:
                Q1 = X[column].quantile(self.q1)
                Q3 = X[column].quantile(self.q3)
                IQR = Q3 - Q1
                outliers = X[(X[column] < (Q1 - 1.5 * IQR)) | (X[column] > (Q3 + 1.5 * IQR))]
            num_outliers = len(outliers)
            if num_outliers:
                self.outliers_columns.append(column)

## Feature selection

In [293]:
class SelectFeaturesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, target_name, model=LinearRegression(), corr_threshold=0.5, n_forward_backward_features=20):
        self.target_name = target_name
        self.model = model
        self.corr_threshold = corr_threshold
        self.n_forward_backward_features = n_forward_backward_features
        self.selected_corr = []
        self.selected_forward = []
        self.selected_backward = []
    
    def fit(self, X, y=None):
        self.selected_corr = (
            x := X.corr(numeric_only=True).abs()[self.target_name]
        ).sort_values(ascending=False)[(x > self.corr_threshold) & (x != 1)].index.to_list()

        y = X[self.target_name]
        X = X.drop(columns=[self.target_name])
        
        sfs = SequentialFeatureSelector(
            self.model, direction='forward', scoring='r2', n_features_to_select=self.n_forward_backward_features)
        sfs.fit(X, y)
        choices = sfs.get_support()
        self.selected_forward = [
            feature for feature, is_selected in zip(X.columns, choices) if is_selected
        ]
        
        sfs = SequentialFeatureSelector(
            self.model, direction='backward', scoring='r2', n_features_to_select=self.n_forward_backward_features)
        sfs.fit(X, y)
        choices = sfs.get_support()
        self.selected_backward = [
            feature for feature, is_selected in zip(X.columns, choices) if is_selected
        ]
        return self

    def transform(self, X, y=None):
        selected_features = list(
            set(self.selected_corr).union(
                set(self.selected_forward).intersection(set(self.selected_backward))
            )
        )
        return X[selected_features]

In [296]:
def transform_df(X, y, outliers_skewed, outliers_near_normal, scaler_near_normal, ohe_drop):
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    non_01_columns = [column for column in numerical_columns if not all(value in [0, 1] for value in df[column].unique())]
    
    skewed_columns = [column for column in numerical_columns if np.abs(df[column].skew()) > 0.5]
    skewed_transformer = Pipeline(steps=[
        ('select_cat_columns', SelectColumnsTransformer(skewed_columns)),
        ('remove_outliers', RemoveOutliersTransformer(method=outliers_skewed)),
        ('scaler', LogTransformer()),
    ])
    
    near_normal_columns = [column for column in non_01_columns if column not in skewed_columns]
    near_normal_transformer = Pipeline(steps=[
        ('select_num_columns', SelectColumnsTransformer(near_normal_columns)),
        ('remove_outliers', RemoveOutliersTransformer(method=outliers_near_normal)),
        ('scaler', scaler_near_normal)
    ])
    
    categorical_columns = X.select_dtypes(include="object").columns.tolist()
    categorical_transformer = Pipeline(steps=[
        ('select_cat_columns', SelectColumnsTransformer(categorical_columns)), 
        ('ohe', OneHotEncoder(drop=ohe_drop, sparse_output=False, dtype=int))
    ])
    
    transformer = ColumnTransformer(transformers=[
        ('skewed', skewed_transformer, skewed_columns), 
        ('near_normal', near_normal_transformer, near_normal_columns), 
        ('categorical', categorical_transformer, categorical_columns),
    ])
    
    pipeline = Pipeline(steps=[
        ('preprocessor', transformer),
        ('concat', ConcatenateFeaturesTargetTransformer()),
        ('dropna', DropNaTransformer())
    ])
    
    df_transformed = pd.DataFrame(pipeline.fit_transform(X, y))
    
    return df_transformed

# Model Building & Evaluation

In [314]:
def get_metrics(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    r2 = r2_score(y_test, y_pred)
    return dict(mae=mae, mape=mape, r2=r2)

In [315]:
# XGBRegressor

distributions = {
    'learning_rate': uniform(0.01, 0.4),
    'max_depth': randint(3, 8),
    'min_child_weight': randint(1, 10),
    'gamma': uniform(0, 2),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'alpha': uniform(0, 1),
    'lambda': uniform(0, 2)
}

In [376]:
X_cleaned = df_cleaned.drop(columns=['ad_price'])
y_cleaned = df_cleaned['ad_price']

In [381]:
def build_and_evaluate_models(
    X, y, target_name,
    outliers_skewed, outliers_near_normal,
    scaler_near_normal,
    ohe_drop,
    corr_threshold, n_forward_backward_features,
    test_size
):
    df_transformed = transform_df(
        X, y, outliers_skewed, outliers_near_normal, scaler_near_normal, ohe_drop
    )
    
    sft = SelectFeaturesTransformer(target_name=target_name, corr_threshold=corr_threshold, n_forward_backward_features=n_forward_backward_features)
    sft.fit(df_transformed)
    selected_X = sft.transform(df_transformed)
    selected_X.info()
    
    X = selected_X
    y = df_transformed['ad_price']
    
    print('\n====== LINEAR REGRESSION ======')
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    linear_reg_model = LinearRegression()
    linear_reg_model.fit(X_train, y_train)
    
    print('TEST SET:')
    y_pred = linear_reg_model.predict(X_test)
    print(get_metrics(y_test, y_pred))
    
    print('TRAINING SET:')
    y_pred_train = linear_reg_model.predict(X_train)
    print(get_metrics(y_train, y_pred_train))
    
    
    print('\n====== XGBOOST REGRESSOR ======')
    
    xgb_reg_model = XGBRegressor(random_state=42)
    optimizer = RandomizedSearchCV(
        xgb_reg_model, 
        distributions, 
        n_iter=100,
        n_jobs=-1
    )
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    optimizer.fit(X_train, y_train)
    best_optimizer_params = optimizer.best_estimator_.get_params()
    
    for k, v in best_optimizer_params.items():
        if v:
            print(f'{k}: {v}')
    print('---------')
    print('TEST SET:')
    y_pred = optimizer.predict(X_test)
    print(get_metrics(y_test, y_pred))
    
    print('TRAINING SET:')
    y_pred_train = optimizer.predict(X_train)
    print(get_metrics(y_train, y_pred_train))

In [382]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='iqr', outliers_near_normal='iqr',
    scaler_near_normal=MinMaxScaler(),
    ohe_drop=None,
    corr_threshold=0.05, n_forward_backward_features=40,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27330 entries, 0 to 27329
Data columns (total 37 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   31      27330 non-null  float64
 1   53      27330 non-null  float64
 2   35      27330 non-null  float64
 3   8       27330 non-null  float64
 4   57      27330 non-null  float64
 5   44      27330 non-null  float64
 6   43      27330 non-null  float64
 7   6       27330 non-null  float64
 8   32      27330 non-null  float64
 9   1       27330 non-null  float64
 10  13      27330 non-null  float64
 11  12      27330 non-null  float64
 12  28      27330 non-null  float64
 13  2       27330 non-null  float64
 14  10      27330 non-null  float64
 15  36      27330 non-null  float64
 16  14      27330 non-null  float64
 17  9       27330 non-null  float64
 18  27      27330 non-null  float64
 19  38      27330 non-null  float64
 20  58      27330 non-null  float64
 21  25      27330 non-null  float64
 22

In [383]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='iqr', outliers_near_normal='iqr',
    scaler_near_normal=StandardScaler(),
    ohe_drop='first',
    corr_threshold=0.25, n_forward_backward_features=30,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27330 entries, 0 to 27329
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   31      27330 non-null  float64
 1   25      27330 non-null  float64
 2   40      27330 non-null  float64
 3   53      27330 non-null  float64
 4   46      27330 non-null  float64
 5   35      27330 non-null  float64
 6   26      27330 non-null  float64
 7   34      27330 non-null  float64
 8   37      27330 non-null  float64
 9   1       27330 non-null  float64
 10  54      27330 non-null  float64
 11  3       27330 non-null  float64
 12  47      27330 non-null  float64
 13  49      27330 non-null  float64
 14  2       27330 non-null  float64
 15  30      27330 non-null  float64
 16  51      27330 non-null  float64
 17  45      27330 non-null  float64
 18  27      27330 non-null  float64
 19  52      27330 non-null  float64
 20  38      27330 non-null  float64
 21  4       27330 non-null  float64
dty

In [384]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='iqr', outliers_near_normal='iqr',
    scaler_near_normal=MinMaxScaler(),
    ohe_drop='if_binary',
    corr_threshold=0.1, n_forward_backward_features=20,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27330 entries, 0 to 27329
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   31      27330 non-null  float64
 1   40      27330 non-null  float64
 2   53      27330 non-null  float64
 3   35      27330 non-null  float64
 4   43      27330 non-null  float64
 5   1       27330 non-null  float64
 6   2       27330 non-null  float64
 7   36      27330 non-null  float64
 8   27      27330 non-null  float64
 9   25      27330 non-null  float64
 10  39      27330 non-null  float64
 11  26      27330 non-null  float64
 12  48      27330 non-null  float64
 13  54      27330 non-null  float64
 14  3       27330 non-null  float64
 15  47      27330 non-null  float64
 16  49      27330 non-null  float64
 17  30      27330 non-null  float64
 18  55      27330 non-null  float64
 19  51      27330 non-null  float64
 20  4       27330 non-null  float64
dtypes: float64(21)
memory usage: 4.4 MB

In [385]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='iqr', outliers_near_normal='iqr',
    scaler_near_normal=StandardScaler(),
    ohe_drop=None,
    corr_threshold=0.1, n_forward_backward_features=10,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27330 entries, 0 to 27329
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   58      27330 non-null  float64
 1   25      27330 non-null  float64
 2   31      27330 non-null  float64
 3   53      27330 non-null  float64
 4   39      27330 non-null  float64
 5   26      27330 non-null  float64
 6   37      27330 non-null  float64
 7   32      27330 non-null  float64
 8   1       27330 non-null  float64
 9   3       27330 non-null  float64
 10  47      27330 non-null  float64
 11  2       27330 non-null  float64
 12  36      27330 non-null  float64
 13  30      27330 non-null  float64
 14  27      27330 non-null  float64
 15  52      27330 non-null  float64
 16  38      27330 non-null  float64
 17  59      27330 non-null  float64
dtypes: float64(18)
memory usage: 3.8 MB

TEST SET:
{'mae': 134524.6918496158, 'mape': 25.85997321709701, 'r2': 0.49409384594979733}
TRAINING SET:
{'ma

In [388]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='zscore', outliers_near_normal='zscore',
    scaler_near_normal=MinMaxScaler(),
    ohe_drop=None,
    corr_threshold=0.05, n_forward_backward_features=40,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36852 entries, 0 to 36851
Data columns (total 36 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   31      36852 non-null  float64
 1   53      36852 non-null  float64
 2   40      36852 non-null  float64
 3   8       36852 non-null  float64
 4   57      36852 non-null  float64
 5   41      36852 non-null  float64
 6   0       36852 non-null  float64
 7   44      36852 non-null  float64
 8   43      36852 non-null  float64
 9   6       36852 non-null  float64
 10  32      36852 non-null  float64
 11  50      36852 non-null  float64
 12  1       36852 non-null  float64
 13  28      36852 non-null  float64
 14  2       36852 non-null  float64
 15  33      36852 non-null  float64
 16  36      36852 non-null  float64
 17  45      36852 non-null  float64
 18  27      36852 non-null  float64
 19  38      36852 non-null  float64
 20  29      36852 non-null  float64
 21  58      36852 non-null  float64
 22

In [389]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='zscore', outliers_near_normal='zscore',
    scaler_near_normal=StandardScaler(),
    ohe_drop='first',
    corr_threshold=0.25, n_forward_backward_features=30,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36852 entries, 0 to 36851
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   31      36852 non-null  float64
 1   25      36852 non-null  float64
 2   40      36852 non-null  float64
 3   53      36852 non-null  float64
 4   46      36852 non-null  float64
 5   35      36852 non-null  float64
 6   26      36852 non-null  float64
 7   42      36852 non-null  float64
 8   41      36852 non-null  float64
 9   0       36852 non-null  float64
 10  34      36852 non-null  float64
 11  54      36852 non-null  float64
 12  3       36852 non-null  float64
 13  47      36852 non-null  float64
 14  2       36852 non-null  float64
 15  36      36852 non-null  float64
 16  30      36852 non-null  float64
 17  51      36852 non-null  float64
 18  45      36852 non-null  float64
 19  27      36852 non-null  float64
 20  52      36852 non-null  float64
 21  38      36852 non-null  float64
 22

In [390]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='zscore', outliers_near_normal='zscore',
    scaler_near_normal=MinMaxScaler(),
    ohe_drop='if_binary',
    corr_threshold=0.1, n_forward_backward_features=20,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36852 entries, 0 to 36851
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   31      36852 non-null  float64
 1   25      36852 non-null  float64
 2   40      36852 non-null  float64
 3   53      36852 non-null  float64
 4   35      36852 non-null  float64
 5   26      36852 non-null  float64
 6   44      36852 non-null  float64
 7   43      36852 non-null  float64
 8   48      36852 non-null  float64
 9   54      36852 non-null  float64
 10  3       36852 non-null  float64
 11  49      36852 non-null  float64
 12  2       36852 non-null  float64
 13  36      36852 non-null  float64
 14  30      36852 non-null  float64
 15  55      36852 non-null  float64
 16  27      36852 non-null  float64
 17  4       36852 non-null  float64
 18  29      36852 non-null  float64
dtypes: float64(19)
memory usage: 5.3 MB

TEST SET:
{'mae': 197151.14148613007, 'mape': 34.74769061160256, 'r2': 0

In [391]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='zscore', outliers_near_normal='zscore',
    scaler_near_normal=StandardScaler(),
    ohe_drop=None,
    corr_threshold=0.1, n_forward_backward_features=10,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36852 entries, 0 to 36851
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   58      36852 non-null  float64
 1   31      36852 non-null  float64
 2   53      36852 non-null  float64
 3   26      36852 non-null  float64
 4   37      36852 non-null  float64
 5   32      36852 non-null  float64
 6   48      36852 non-null  float64
 7   3       36852 non-null  float64
 8   47      36852 non-null  float64
 9   2       36852 non-null  float64
 10  36      36852 non-null  float64
 11  30      36852 non-null  float64
 12  27      36852 non-null  float64
 13  52      36852 non-null  float64
 14  59      36852 non-null  float64
 15  29      36852 non-null  float64
dtypes: float64(16)
memory usage: 4.5 MB

TEST SET:
{'mae': 204296.82037715372, 'mape': 35.91971069564238, 'r2': 0.5048680689989395}
TRAINING SET:
{'mae': 203933.2878786337, 'mape': 35.2292537151221, 'r2': 0.5147369669834359}

In [392]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='iqr', outliers_near_normal='zscore',
    scaler_near_normal=MinMaxScaler(),
    ohe_drop=None,
    corr_threshold=0.5, n_forward_backward_features=40,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27423 entries, 0 to 27422
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   61      27423 non-null  float64
 1   31      27423 non-null  float64
 2   25      27423 non-null  float64
 3   39      27423 non-null  float64
 4   57      27423 non-null  float64
 5   26      27423 non-null  float64
 6   6       27423 non-null  float64
 7   32      27423 non-null  float64
 8   48      27423 non-null  float64
 9   56      27423 non-null  float64
 10  47      27423 non-null  float64
 11  2       27423 non-null  float64
 12  36      27423 non-null  float64
 13  5       27423 non-null  float64
 14  55      27423 non-null  float64
 15  45      27423 non-null  float64
 16  60      27423 non-null  float64
 17  52      27423 non-null  float64
 18  38      27423 non-null  float64
 19  7       27423 non-null  float64
 20  59      27423 non-null  float64
dtypes: float64(21)
memory usage: 4.4 MB

In [393]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='zscore', outliers_near_normal='iqr',
    scaler_near_normal=StandardScaler(),
    ohe_drop='first',
    corr_threshold=0.25, n_forward_backward_features=30,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36740 entries, 0 to 36739
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   31      36740 non-null  float64
 1   25      36740 non-null  float64
 2   53      36740 non-null  float64
 3   46      36740 non-null  float64
 4   35      36740 non-null  float64
 5   26      36740 non-null  float64
 6   42      36740 non-null  float64
 7   41      36740 non-null  float64
 8   0       36740 non-null  float64
 9   34      36740 non-null  float64
 10  54      36740 non-null  float64
 11  3       36740 non-null  float64
 12  47      36740 non-null  float64
 13  2       36740 non-null  float64
 14  30      36740 non-null  float64
 15  51      36740 non-null  float64
 16  45      36740 non-null  float64
 17  27      36740 non-null  float64
 18  52      36740 non-null  float64
 19  38      36740 non-null  float64
 20  4       36740 non-null  float64
dtypes: float64(21)
memory usage: 5.9 MB

In [396]:
build_and_evaluate_models(
    X=X_cleaned, y=y_cleaned, target_name='ad_price',
    outliers_skewed='iqr', outliers_near_normal='iqr',
    scaler_near_normal=MinMaxScaler(),
    ohe_drop=None,
    corr_threshold=0.01, n_forward_backward_features=10,
    test_size=0.2
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27330 entries, 0 to 27329
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   61      27330 non-null  float64
 1   31      27330 non-null  float64
 2   53      27330 non-null  float64
 3   40      27330 non-null  float64
 4   35      27330 non-null  float64
 5   41      27330 non-null  float64
 6   0       27330 non-null  float64
 7   44      27330 non-null  float64
 8   43      27330 non-null  float64
 9   32      27330 non-null  float64
 10  50      27330 non-null  float64
 11  1       27330 non-null  float64
 12  56      27330 non-null  float64
 13  28      27330 non-null  float64
 14  2       27330 non-null  float64
 15  33      27330 non-null  float64
 16  36      27330 non-null  float64
 17  45      27330 non-null  float64
 18  27      27330 non-null  float64
 19  38      27330 non-null  float64
 20  29      27330 non-null  float64
 21  58      27330 non-null  float64
 22