In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from transformers import BertTokenizer, BertModel
from tqdm import tqdm

import torch
import torch.nn as nn

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [2]:
train_df = pd.read_csv('clean_data.csv')
val_df = pd.read_csv('clean_val.csv')
test_df = pd.read_csv('clean_test.csv')

In [3]:
train_df.dropna(inplace = True)

In [4]:
with open("numerical.txt", "r", encoding="utf-8") as file:
    numeric_features = [line.strip() for line in file.readlines()]
    
with open("cat.txt", "r", encoding="utf-8") as file:
    cat_features = [line.strip() for line in file.readlines()]
    
with open("text.txt", "r", encoding="utf-8") as file:
    text_features = [line.strip() for line in file.readlines()]

In [5]:
print('Total number of Numerical Features and one target:', len(numeric_features))
print('Total number of Categorical Features:',len(cat_features))
print('Total number of Text Features:',len(text_features))

Total number of Numerical Features and one target: 9
Total number of Categorical Features: 70
Total number of Text Features: 2


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')
model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [7]:
class BoxCoxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lambdas_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        for column in X.columns:
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            _, lambda_optimal = stats.boxcox(X[column])
            self.lambdas_[column] = lambda_optimal
        return self

    def transform(self, X):
        X = X.copy()
        for column, lambda_optimal in self.lambdas_.items():
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            X[column] = stats.boxcox(X[column], lmbda=lambda_optimal)
        return X

def get_bert_embeddings(text_series, batch_size):
    embeddings = []
    model.eval()  
    
    for i in range(0, len(text_series), batch_size):
        batch_texts = text_series[i:i + batch_size]
        
        if not batch_texts: 
            continue
        
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            # Get model output
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(batch_embeddings)
    
    if embeddings:
        return np.vstack(embeddings)
    else:
        return np.array([])
    
def concat(df, text_features, batch_size, pooling, is_train = False):

    embeddings_list = []

    for feature in text_features:
        print(feature)
        text_embeddings = get_bert_embeddings(df[feature].tolist(), batch_size)

        if pooling == 'max':
            text_embeddings = np.max(text_embeddings, axis=1)  
        elif pooling == 'mean':
            text_embeddings = np.mean(text_embeddings, axis=1)  

        if text_embeddings.ndim == 1:
            text_embeddings = text_embeddings.reshape(-1, 1)

        embeddings_list.append(text_embeddings)
        
    if is_train:
        transformed_features = preprocessor.fit_transform(df).toarray()
    else:
        transformed_features = preprocessor.transform(df).toarray()

    combined_features = np.concatenate(embeddings_list + [transformed_features], axis=1)

    return combined_features

In [8]:
numeric_transformer = Pipeline([
    ('boxcox', BoxCoxTransformer()),
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

In [9]:
train_combined_features = concat(train_df, text_features, batch_size = 32, pooling = 'mean', is_train = True)
val_combined_features = concat(val_df, text_features, batch_size = 32, pooling = 'mean', is_train = False)
test_combined_features = concat(test_df, text_features, batch_size = 32, pooling = 'mean', is_train = False)

summary
host_about
summary
host_about
summary
host_about


In [10]:
train_y = train_df['price']
val_y = val_df['price']
test_y = test_df['price']

In [11]:
from tqdm import tqdm

def rmse(y, y_predict):
    mse = mean_squared_error(y, y_predict)
    return math.sqrt(mse)

def evaluate_models(model):
    results = []

    for name, model in tqdm(model, desc="Evaluating Models"):
        
        model.fit(train_combined_features, train_y)
        
        val_predict = model.predict(val_combined_features)
        
        val_mae = mean_absolute_error(val_y, val_predict)
        val_rmse = rmse(val_y, val_predict)
        
        
        test_predict = model.predict(test_combined_features)
        test_mae = mean_absolute_error(test_y, test_predict)
        test_rmse = rmse(test_y, test_predict)
       
        results.append({
            'model': name,
            'val_mae': val_mae,
            'val_rmse': val_rmse,
            'test_mae': test_mae,
            'test_rmse': test_rmse,
        })
    
    return results

In [12]:
xgb = XGBRegressor()
lgm = LGBMRegressor()
cat = CatBoostRegressor()

models = [('xgb', xgb), ('lgm', lgm),('cat', cat)]

In [13]:
res = evaluate_models(models)

Evaluating Models:  67%|███████████████████████████████████████████▎                     | 2/3 [00:04<00:01,  1.90s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1792
[LightGBM] [Info] Number of data points in the train set: 17941, number of used features: 304
[LightGBM] [Info] Start training from score 134.868792
Learning rate set to 0.064608
0:	learn: 98.9517255	total: 132ms	remaining: 2m 11s
1:	learn: 96.4784396	total: 141ms	remaining: 1m 10s
2:	learn: 94.2334523	total: 148ms	remaining: 49.3s
3:	learn: 92.2694931	total: 156ms	remaining: 38.9s
4:	learn: 90.4411428	total: 163ms	remaining: 32.5s
5:	learn: 88.6965736	total: 171ms	remaining: 28.3s
6:	learn: 87.0439629	total: 178ms	remaining: 25.2s
7:	learn: 85.5756913	total: 186ms	remaining: 23s
8:	learn: 84.2978540	total: 193ms	remaining: 21.2s
9:	learn: 83.0879026	total: 201ms	remaining: 19.9s
10:	learn: 81.9851776	total: 209ms	remaining: 18.8s

Evaluating Models: 100%|█████████████████████████████████████████████████████████████████| 3/3 [00:17<00:00,  5.67s/it]


In [14]:
res_df = pd.DataFrame(res)
res_df.sort_values(by = 'test_mae')

Unnamed: 0,model,val_mae,val_rmse,test_mae,test_rmse
2,cat,49.025434,274.842429,45.307204,134.997461
1,lgm,49.268485,274.288485,45.590967,137.630692
0,xgb,50.205356,274.397772,45.987521,134.998949


In [15]:
import optuna

def objective(trial):
    
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.03)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    n_estimators = trial.suggest_int('n_estimators', 1000, 2000)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    alpha =  trial.suggest_loguniform('alpha', 1e-8, 10.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 1.0)
    
    model = XGBRegressor(
                learning_rate=learning_rate,
                max_depth=max_depth,
                n_estimators=n_estimators,
                subsample=subsample,
                alpha = alpha,
                colsample_bytree = colsample_bytree,
                random_state=42
            )
    
    model.fit(train_combined_features, train_y)
  
    val_predict = model.predict(val_combined_features)

    val_mae = mean_absolute_error(val_y, val_predict)
    val_rmse = rmse(val_y, val_predict)
    
    return val_mae

In [16]:
study3 = optuna.create_study(direction='minimize')
study3.optimize(objective, n_trials=50) 

[32m[I 2024-11-17 15:13:18,410][0m A new study created in memory with name: no-name-d205b76c-3665-49ad-ae4f-354298b888ea[0m
[32m[I 2024-11-17 15:14:01,063][0m Trial 0 finished with value: 48.14297330103225 and parameters: {'learning_rate': 0.02529155732114057, 'max_depth': 6, 'n_estimators': 1362, 'subsample': 0.982829883138274, 'alpha': 0.0005293240855312182, 'colsample_bytree': 0.7124954104696086}. Best is trial 0 with value: 48.14297330103225.[0m
[32m[I 2024-11-17 15:15:02,247][0m Trial 1 finished with value: 47.10209709554454 and parameters: {'learning_rate': 0.028176915211870907, 'max_depth': 9, 'n_estimators': 1422, 'subsample': 0.8527455782098172, 'alpha': 0.4278537201197314, 'colsample_bytree': 0.6626615875414801}. Best is trial 1 with value: 47.10209709554454.[0m
[32m[I 2024-11-17 15:15:36,453][0m Trial 2 finished with value: 47.75297630953904 and parameters: {'learning_rate': 0.015671600888699587, 'max_depth': 7, 'n_estimators': 1212, 'subsample': 0.795291918009722

In [17]:
xgb_best_param = study3.best_trial.params
print("Best trial for XGBRegressor:")
print(study3.best_trial.params)
print("Best Validation MAE:", study3.best_value)

Best trial for XGBRegressor:
{'learning_rate': 0.024248041404348896, 'max_depth': 9, 'n_estimators': 1496, 'subsample': 0.9148037908074077, 'alpha': 4.902595740683458, 'colsample_bytree': 0.5466765834567964}
Best Validation MAE: 46.74930055223806


In [18]:
best_xgb = XGBRegressor(**xgb_best_param)


best_models = [('xgb', best_xgb)]

In [19]:
res = evaluate_models(best_models)

Evaluating Models: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:53<00:00, 53.89s/it]


In [20]:
res_df = pd.DataFrame(res)
res_df.sort_values(by = 'test_mae')

Unnamed: 0,model,val_mae,val_rmse,test_mae,test_rmse
0,xgb,46.91407,273.861664,43.775337,136.390741
