In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from transformers import BertTokenizer, BertModel
from tqdm import tqdm

import torch
import torch.nn as nn

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [2]:
train_df = pd.read_csv('clean_data.csv')
val_df = pd.read_csv('clean_val.csv')
test_df = pd.read_csv('clean_test.csv')

In [3]:
train_df.dropna(inplace = True)

In [4]:
with open("numerical.txt", "r", encoding="utf-8") as file:
    numeric_features = [line.strip() for line in file.readlines()]
    
with open("cat.txt", "r", encoding="utf-8") as file:
    cat_features = [line.strip() for line in file.readlines()]
    
with open("text.txt", "r", encoding="utf-8") as file:
    text_features = [line.strip() for line in file.readlines()]

In [5]:
print('Total number of Numerical Features and one target:', len(numeric_features))
print('Total number of Categorical Features:',len(cat_features))
print('Total number of Text Features:',len(text_features))

Total number of Numerical Features and one target: 9
Total number of Categorical Features: 70
Total number of Text Features: 2


In [6]:
def combine_text(df):
    df['combined_text'] = df['summary'] + " [SEP] "  + df['host_about'] + " [SEP] "
    return 

In [7]:
combine_text(train_df)
combine_text(val_df)
combine_text(test_df)

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
embedding_model = BertModel.from_pretrained('bert-base-multilingual-cased')
embedding_model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [9]:
class BoxCoxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lambdas_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        for column in X.columns:
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            _, lambda_optimal = stats.boxcox(X[column])
            self.lambdas_[column] = lambda_optimal
        return self

    def transform(self, X):
        X = X.copy()
        for column, lambda_optimal in self.lambdas_.items():
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            X[column] = stats.boxcox(X[column], lmbda=lambda_optimal)
        return X



In [10]:
numeric_transformer = Pipeline([
    ('boxcox', BoxCoxTransformer()),
    ('scaler', StandardScaler())
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

In [11]:
def get_bert_embeddings(text_series, batch_size):
    embeddings = []
    embedding_model.eval()  
    
    for i in range(0, len(text_series), batch_size):
        batch_texts = text_series[i:i + batch_size]
        
        if not batch_texts: 
            continue
        
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True, max_length=128)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = embedding_model(**inputs)
            batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(batch_embeddings)
    
    if embeddings:
        return np.vstack(embeddings)
    else:
        return np.array([])
    
def concat(df, text_features, batch_size, pooling, is_train = False):

    embeddings_list = []

    for feature in text_features:
        print(feature)
        text_embeddings = get_bert_embeddings(df[feature].tolist(), batch_size)
        
    embeddings_list.append(text_embeddings)
        
    if is_train:
        transformed_features = preprocessor.fit_transform(df).toarray()
    else:
        transformed_features = preprocessor.transform(df).toarray()

    combined_features = np.concatenate(embeddings_list + [transformed_features], axis=1)

    return combined_features

In [12]:
train_y = train_df['price']
val_y = val_df['price']
test_y = test_df['price']

In [13]:
text_features = ['combined_text']

In [14]:
train_combined_features = concat(train_df, text_features, batch_size = 32, pooling = 'max', is_train = True)
val_combined_features = concat(val_df, text_features, batch_size = 32, pooling = 'max', is_train = False)
test_combined_features = concat(test_df, text_features, batch_size = 32, pooling = 'max', is_train = False)

combined_text
combined_text
combined_text


In [15]:
from tqdm import tqdm

def rmse(y, y_predict):
    mse = mean_squared_error(y, y_predict)
    return math.sqrt(mse)

def evaluate_models(model):
    results = []

    for name, model in tqdm(model, desc="Evaluating Models"):
        
        model.fit(train_combined_features, train_y)
        
        val_predict = model.predict(val_combined_features)
        
        val_mae = mean_absolute_error(val_y, val_predict)
        val_rmse = rmse(val_y, val_predict)
        
        
        test_predict = model.predict(test_combined_features)
        test_mae = mean_absolute_error(test_y, test_predict)
        test_rmse = rmse(test_y, test_predict)
       
        results.append({
            'model': name,
            'val_mae': val_mae,
            'val_rmse': val_rmse,
            'test_mae': test_mae,
            'test_rmse': test_rmse,
        })
    
    return results

In [16]:
xgb = XGBRegressor()
lgm = LGBMRegressor()
cat = CatBoostRegressor()

models = [('xgb', xgb), ('lgm', lgm),('cat', cat)]

In [17]:
res = evaluate_models(models)

Evaluating Models:  33%|█████████████████████▋                                           | 1/3 [00:29<00:58, 29.17s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049838 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 197122
[LightGBM] [Info] Number of data points in the train set: 17941, number of used features: 1070
[LightGBM] [Info] Start training from score 134.868792


Evaluating Models:  67%|███████████████████████████████████████████▎                     | 2/3 [00:32<00:14, 14.19s/it]

Learning rate set to 0.064608
0:	learn: 99.3973049	total: 210ms	remaining: 3m 29s
1:	learn: 96.8164823	total: 263ms	remaining: 2m 11s
2:	learn: 94.5700470	total: 316ms	remaining: 1m 45s
3:	learn: 92.5905097	total: 371ms	remaining: 1m 32s
4:	learn: 90.7642846	total: 428ms	remaining: 1m 25s
5:	learn: 88.9525255	total: 483ms	remaining: 1m 20s
6:	learn: 87.4665115	total: 536ms	remaining: 1m 16s
7:	learn: 85.9902933	total: 587ms	remaining: 1m 12s
8:	learn: 84.7202966	total: 643ms	remaining: 1m 10s
9:	learn: 83.6343599	total: 696ms	remaining: 1m 8s
10:	learn: 82.5816892	total: 749ms	remaining: 1m 7s
11:	learn: 81.5967751	total: 801ms	remaining: 1m 5s
12:	learn: 80.6750128	total: 853ms	remaining: 1m 4s
13:	learn: 79.8076188	total: 905ms	remaining: 1m 3s
14:	learn: 79.1385582	total: 955ms	remaining: 1m 2s
15:	learn: 78.3837525	total: 1.01s	remaining: 1m 1s
16:	learn: 77.8123726	total: 1.06s	remaining: 1m 1s
17:	learn: 77.1764288	total: 1.11s	remaining: 1m
18:	learn: 76.7189184	total: 1.17s	rem

Evaluating Models: 100%|█████████████████████████████████████████████████████████████████| 3/3 [01:36<00:00, 32.19s/it]


In [18]:
res_df = pd.DataFrame(res)
res_df.sort_values(by = 'test_mae')

Unnamed: 0,model,val_mae,val_rmse,test_mae,test_rmse
2,cat,49.60639,276.150606,46.27099,138.130234
1,lgm,50.998814,276.891305,47.054048,140.007953
0,xgb,52.687482,277.239849,49.349304,140.3871


In [19]:
import optuna

def objective(trial):
    
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.03)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    n_estimators = trial.suggest_int('n_estimators', 1000, 2000)
    subsample = trial.suggest_float('subsample', 0.6, 1.0)
    alpha =  trial.suggest_loguniform('alpha', 1e-8, 10.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.4, 1.0)
   
    model = XGBRegressor(
                learning_rate=learning_rate,
                max_depth=max_depth,
                n_estimators=n_estimators,
                subsample=subsample,
                alpha = alpha,
                colsample_bytree = colsample_bytree,
                random_state=42
            )
    
    model.fit(train_combined_features, train_y)
  
    val_predict = model.predict(val_combined_features)

    val_mae = mean_absolute_error(val_y, val_predict)
    val_rmse = rmse(val_y, val_predict)
    
    return val_mae

In [20]:
study3 = optuna.create_study(direction='minimize')
study3.optimize(objective, n_trials=20) 

[32m[I 2024-11-18 09:07:59,840][0m A new study created in memory with name: no-name-1d1c1a00-4437-4160-a302-f81cb2551540[0m
[32m[I 2024-11-18 09:10:10,959][0m Trial 0 finished with value: 51.153088017952825 and parameters: {'learning_rate': 0.01415142268901411, 'max_depth': 4, 'n_estimators': 1461, 'subsample': 0.6601373872583688, 'alpha': 1.903102968952036e-06, 'colsample_bytree': 0.5353721333702279}. Best is trial 0 with value: 51.153088017952825.[0m
[32m[I 2024-11-18 09:15:33,851][0m Trial 1 finished with value: 49.49071093235258 and parameters: {'learning_rate': 0.01922958032977066, 'max_depth': 8, 'n_estimators': 1568, 'subsample': 0.8838617853752397, 'alpha': 5.1981834093639185e-05, 'colsample_bytree': 0.5866693542665287}. Best is trial 1 with value: 49.49071093235258.[0m
[32m[I 2024-11-18 09:22:22,235][0m Trial 2 finished with value: 49.780874209785544 and parameters: {'learning_rate': 0.014406196556734897, 'max_depth': 7, 'n_estimators': 1445, 'subsample': 0.85122151

In [21]:
xgb_best_param = study3.best_trial.params
print("Best trial for XGBRegressor:")
print(study3.best_trial.params)
print("Best Validation MAE:", study3.best_value)

Best trial for XGBRegressor:
{'learning_rate': 0.01651935758321106, 'max_depth': 7, 'n_estimators': 1606, 'subsample': 0.8911864740473283, 'alpha': 1.8675847821660086e-05, 'colsample_bytree': 0.4151857474379464}
Best Validation MAE: 49.46963869227454


In [22]:
best_xgb = XGBRegressor(**xgb_best_param)
best_models = [('xgb', best_xgb)]

In [23]:
res = evaluate_models(best_models)

Evaluating Models: 100%|████████████████████████████████████████████████████████████████| 1/1 [03:33<00:00, 213.01s/it]


In [24]:
res_df = pd.DataFrame(res)
res_df.sort_values(by = 'test_mae')

Unnamed: 0,model,val_mae,val_rmse,test_mae,test_rmse
0,xgb,49.533154,276.792987,45.693632,140.799742
