In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from tqdm import tqdm


pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [2]:
train_df = pd.read_csv('clean_data.csv')
val_df = pd.read_csv('clean_val.csv')
test_df = pd.read_csv('clean_test.csv')

In [3]:
train_df.dropna(inplace = True)

In [4]:
with open("numerical.txt", "r", encoding="utf-8") as file:
    numeric_features = [line.strip() for line in file.readlines()]
    
with open("cat.txt", "r", encoding="utf-8") as file:
    cat_features = [line.strip() for line in file.readlines()]
    
with open("text.txt", "r", encoding="utf-8") as file:
    text_features = [line.strip() for line in file.readlines()]

In [5]:
print('Total number of Numerical Features and one target:', len(numeric_features))
print('Total number of Categorical Features:',len(cat_features))
print('Total number of Text Features:',len(text_features))

Total number of Numerical Features and one target: 9
Total number of Categorical Features: 70
Total number of Text Features: 2


In [6]:
class BoxCoxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lambdas_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        for column in X.columns:
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            _, lambda_optimal = stats.boxcox(X[column])
            self.lambdas_[column] = lambda_optimal
        return self

    def transform(self, X):
        X = X.copy()
        for column, lambda_optimal in self.lambdas_.items():
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            X[column] = stats.boxcox(X[column], lmbda=lambda_optimal)
        return X
    
class TextCombiner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return (X['summary'].fillna("") + " " + X['host_about'].fillna("")).str.lower()


In [7]:
lg = LinearRegression()
ridge = Ridge()
lasso = Lasso()
elastic = ElasticNet()

#rf = RandomForestRegressor()
xgb = XGBRegressor()
lgm = LGBMRegressor()
cat = CatBoostRegressor()

models = [('lg', lg), ('ridge', ridge), ('lasso', lasso), ('elastic', elastic), ('xgb', xgb), ('lgm', lgm),('cat', cat)]

In [8]:
train_X = train_df[numeric_features + cat_features + text_features]
train_y = train_df['price']

val_X = val_df[numeric_features + cat_features + text_features]
val_y = val_df['price']

test_X = test_df[numeric_features + cat_features + text_features]
test_y = test_df['price']

In [9]:
numeric_transformer = Pipeline([
    ('boxcox', BoxCoxTransformer()),
    ('scaler', StandardScaler())
])

text_transformer = Pipeline([
    ('combine', TextCombiner()),
    ('tfidf', TfidfVectorizer(max_features = 100))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ('tfidf', text_transformer, text_features),
        
    ]
)


In [10]:
from tqdm import tqdm

def rmse(y, y_predict):
    mse = mean_squared_error(y, y_predict)
    return math.sqrt(mse)

def evaluate_models(model):
    results = []

    for name, model in tqdm(model, desc="Evaluating Models"):
        
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regression', model)
        ])
        
        pipeline.fit(train_X, train_y)
        
        val_predict = pipeline.predict(val_X)
        
        val_mae = mean_absolute_error(val_y, val_predict)
        val_rmse = rmse(val_y, val_predict)
        
        
        test_predict = pipeline.predict(test_X)
        test_mae = mean_absolute_error(test_y, test_predict)
        test_rmse = rmse(test_y, test_predict)
       
        results.append({
            'model': name,
            'val_mae': val_mae,
            'val_rmse': val_rmse,
            'test_mae': test_mae,
            'test_rmse': test_rmse,
        })
    
    return results

In [11]:
res = evaluate_models(models)

Evaluating Models:  71%|██████████████████████████████████████████████▍                  | 5/7 [00:26<00:10,  5.14s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016627 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 26781
[LightGBM] [Info] Number of data points in the train set: 17941, number of used features: 402
[LightGBM] [Info] Start training from score 134.868792


Evaluating Models:  86%|███████████████████████████████████████████████████████▋         | 6/7 [00:30<00:04,  4.81s/it]

Learning rate set to 0.064608
0:	learn: 98.9631766	total: 161ms	remaining: 2m 40s
1:	learn: 96.4005618	total: 178ms	remaining: 1m 29s
2:	learn: 94.1269913	total: 196ms	remaining: 1m 5s
3:	learn: 92.1185515	total: 213ms	remaining: 53.1s
4:	learn: 90.2156313	total: 231ms	remaining: 46s
5:	learn: 88.5804279	total: 247ms	remaining: 40.9s
6:	learn: 86.9666718	total: 263ms	remaining: 37.3s
7:	learn: 85.5122089	total: 279ms	remaining: 34.6s
8:	learn: 84.1937375	total: 296ms	remaining: 32.6s
9:	learn: 83.0180221	total: 313ms	remaining: 31s
10:	learn: 81.9620532	total: 328ms	remaining: 29.5s
11:	learn: 81.0257436	total: 343ms	remaining: 28.3s
12:	learn: 80.1553922	total: 358ms	remaining: 27.1s
13:	learn: 79.3772465	total: 374ms	remaining: 26.3s
14:	learn: 78.6275264	total: 390ms	remaining: 25.6s
15:	learn: 77.9969538	total: 407ms	remaining: 25s
16:	learn: 77.3485679	total: 424ms	remaining: 24.5s
17:	learn: 76.7890650	total: 441ms	remaining: 24.1s
18:	learn: 76.2645369	total: 461ms	remaining: 23

Evaluating Models: 100%|█████████████████████████████████████████████████████████████████| 7/7 [00:46<00:00,  6.68s/it]


In [12]:
res_df = pd.DataFrame(res)
res_df.sort_values(by = 'test_mae')

Unnamed: 0,model,val_mae,val_rmse,test_mae,test_rmse
6,cat,48.678431,275.11301,45.089656,135.614606
5,lgm,49.556822,275.11619,45.852156,138.230889
4,xgb,51.314067,276.658519,46.734344,135.955662
1,ridge,60.073701,281.227787,55.954827,149.636803
0,lg,60.508132,281.375625,56.23125,149.337118
2,lasso,60.701732,282.314485,56.833592,153.526881
3,elastic,62.635816,283.371473,58.123628,156.158334


In [13]:
import optuna

def objective(trial):
    
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.03)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    n_estimators = trial.suggest_int('n_estimators', 1000, 2000)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    alpha =  trial.suggest_loguniform('alpha', 1e-8, 10.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 1.0)

    model = XGBRegressor(
                learning_rate=learning_rate,
                max_depth=max_depth,
                n_estimators=n_estimators,
                subsample=subsample,
                alpha = alpha,
                colsample_bytree = colsample_bytree,
                random_state=42
            )
    
    pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('regression', model)
        ])
        
    pipeline.fit(train_X, train_y)

    val_predict = pipeline.predict(val_X)

    val_mae = mean_absolute_error(val_y, val_predict)
    val_rmse = rmse(val_y, val_predict)
    
    return val_mae

In [14]:
study3 = optuna.create_study(direction='minimize')
study3.optimize(objective, n_trials=50) 

[32m[I 2024-11-18 22:47:52,380][0m A new study created in memory with name: no-name-8273f1dd-4c09-4346-ae56-30d8676338d2[0m
[32m[I 2024-11-18 22:48:10,866][0m Trial 0 finished with value: 50.26612850250925 and parameters: {'learning_rate': 0.011029295656377525, 'max_depth': 4, 'n_estimators': 1440, 'subsample': 0.7129931185299188, 'alpha': 5.658482653704755e-07, 'colsample_bytree': 0.5930692827504584}. Best is trial 0 with value: 50.26612850250925.[0m
[32m[I 2024-11-18 22:48:38,156][0m Trial 1 finished with value: 47.60199977155851 and parameters: {'learning_rate': 0.026566025649999513, 'max_depth': 10, 'n_estimators': 1062, 'subsample': 0.8641086842388204, 'alpha': 0.35043887712435323, 'colsample_bytree': 0.43727320002637327}. Best is trial 1 with value: 47.60199977155851.[0m
[32m[I 2024-11-18 22:49:09,708][0m Trial 2 finished with value: 49.34219687361357 and parameters: {'learning_rate': 0.011331913889602222, 'max_depth': 5, 'n_estimators': 1484, 'subsample': 0.9132022445

In [15]:
xgb_best_param = study3.best_trial.params
print("Best trial for XGBRegressor:")
print(study3.best_trial.params)
print("Best Validation MAE:", study3.best_value)

Best trial for XGBRegressor:
{'learning_rate': 0.025759367735307244, 'max_depth': 9, 'n_estimators': 1315, 'subsample': 0.8376960771817454, 'alpha': 0.0008155611967585163, 'colsample_bytree': 0.6240760746490244}
Best Validation MAE: 47.43530266848932


In [16]:
best_xgb = XGBRegressor(**xgb_best_param)
best_models = [('xgb', best_xgb)]

In [17]:
res = evaluate_models(best_models)

Evaluating Models: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:36<00:00, 36.63s/it]


In [18]:
res_df = pd.DataFrame(res)
res_df.sort_values(by = 'test_mae')

Unnamed: 0,model,val_mae,val_rmse,test_mae,test_rmse
0,xgb,47.768326,275.328451,43.788042,138.038042
