In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [2]:
train_df = pd.read_csv('clean_data.csv')
val_df = pd.read_csv('clean_val.csv')
test_df = pd.read_csv('clean_test.csv')

In [3]:
train_df.dropna(inplace = True)

In [4]:
train_df.isnull().sum().sum()

0

In [5]:
with open("numerical.txt", "r", encoding="utf-8") as file:
    numeric_features = [line.strip() for line in file.readlines()]
    
with open("cat.txt", "r", encoding="utf-8") as file:
    cat_features = [line.strip() for line in file.readlines()]
    
with open("text.txt", "r", encoding="utf-8") as file:
    text_features = [line.strip() for line in file.readlines()]

In [6]:
def combine_text(df):
    df['combined_text'] = df['summary'] + " [SEP] "  + df['host_about'] + " [SEP] "
    return 

In [7]:
combine_text(train_df)
combine_text(val_df)
combine_text(test_df)

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error
from transformers import BertTokenizer, BertModel


In [9]:
class BoxCoxTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.lambdas_ = {}

    def fit(self, X, y=None):
        X = X.copy()
        for column in X.columns:
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            _, lambda_optimal = stats.boxcox(X[column])
            self.lambdas_[column] = lambda_optimal
        return self

    def transform(self, X):
        X = X.copy()
        for column, lambda_optimal in self.lambdas_.items():
            if any(X[column] <= 0):
                X[column] = X[column] + abs(X[column].min()) + 1
            X[column] = stats.boxcox(X[column], lmbda=lambda_optimal)
        return X

In [10]:
numeric_transformer = Pipeline([
    ('boxcox', BoxCoxTransformer()),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)

preprocessor.fit(train_df)


ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('boxcox',
                                                  BoxCoxTransformer()),
                                                 ('scaler', StandardScaler())]),
                                 ['accommodates', 'bathrooms', 'bedrooms',
                                  'beds', 'cleaning_fee', 'guests_included',
                                  'availability_30', 'availability_365',
                                  'number_of_reviews']),
                                ('cat', OneHotEncoder(handle_unknown='ignore'),
                                 ['suburb', 'property_type', 'city',
                                  'room_type', 'doub...
                                  'tennis_court', 'elevator',
                                  'family/kid_friendly', 'tv',
                                  'air_conditioning', 'high_chair',
                                  'dishwasher', 'brick_oven',
     

In [11]:
class MyDataset(Dataset):
    def __init__(self, data, text, normalized_label, target, tokenizer, max_len, preprocessor):
        self.text = text.reset_index(drop=True)
        self.normalized_label = normalized_label.reset_index(drop=True)
        self.data = preprocessor.transform(data)
        self.target = target.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.target)

    def __getitem__(self, index):
        text = self.text.iloc[index]
        normalized_label = self.normalized_label[index]
        data = torch.tensor(self.data[index].toarray(), dtype=torch.float32) 
        encoding = self.tokenizer.encode_plus(
                    text = text,
                    padding = 'max_length',
                    truncation = True,
                    max_length=self.max_len,
                    return_tensors = 'pt',
                    return_token_type_ids = False,
                    return_attention_mask = True )
        target = self.target[index] 
        
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'data': data,
                'normalized_label':torch.tensor(normalized_label),
                'target':torch.tensor(target)
        }


In [12]:
def normalize(x):
    return (x - train_df['price'].min()) / (train_df['price'].max() - train_df['price'].min())

def denormalize(x):
    return x * (train_df['price'].max() - train_df['price'].min()) + train_df['price'].min()

In [13]:
train_df['normalized_price'] = train_df['price'].apply(normalize)
val_df['normalized_price'] = val_df['price'].apply(normalize)
test_df['normalized_price'] = test_df['price'].apply(normalize)

In [14]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
max_len = 128

In [15]:
batch_size = 32

train_data_X = train_df[numeric_features + cat_features]
train_text_X = train_df['combined_text']
train_norm_y = train_df['normalized_price']
train_y = train_df['price']

val_data_X = val_df[numeric_features + cat_features]
val_text_X = train_df['combined_text']
val_norm_y = val_df['normalized_price']
val_y = val_df['price']

test_data_X = test_df[numeric_features + cat_features]
test_text_X = train_df['combined_text']
test_norm_y = test_df['normalized_price']
test_y = test_df['price']

train_dataset = MyDataset(train_data_X, train_text_X, train_norm_y, train_y, tokenizer, max_len, preprocessor=preprocessor)
val_dataset = MyDataset(val_data_X, val_text_X, val_norm_y, val_y, tokenizer, max_len, preprocessor=preprocessor)
test_dataset = MyDataset(test_data_X, test_text_X, test_norm_y, test_y, tokenizer, max_len, preprocessor=preprocessor)

train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = True)


In [16]:
class MultiModal_RegressionModel(nn.Module):
    def __init__(self, num_numerical_features):
        super(MultiModal_RegressionModel, self).__init__()
        
        self.bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
        bert_output_dim = self.bert_model.config.hidden_size # 768
        
        for layer in self.bert_model.encoder.layer[:6]:
            for param in layer.parameters():
                param.requires_grad = False
        
        for layer in self.bert_model.encoder.layer[6:]:
            for param in layer.parameters():
                param.requires_grad = True  
                
        self.mlp = nn.Sequential(
            nn.Linear(num_numerical_features, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.2),
            
        )
        
        self.fc = nn.Sequential(
            nn.Linear(bert_output_dim + 32, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
        
    def forward(self, input_ids, attention_mask, numerical_features):
        
        bert_outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = bert_outputs.pooler_output  

        num_features_output = self.mlp(numerical_features)  

     
        combined_features = torch.cat((text_features, num_features_output), dim=1) 

        output = self.fc(combined_features) 
        return output

In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [18]:
preprocessor.transform(train_data_X).shape[1]

723

In [19]:
input_dim = preprocessor.transform(train_data_X).shape[1]
model = MultiModal_RegressionModel(input_dim)
model = model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [21]:
import time
import math
from tqdm import tqdm

def rmse(y, y_predict):
    mse = mean_squared_error(y, y_predict)
    return math.sqrt(mse)

def train(model, data_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    start_time = time.time()
    
    for batch in tqdm(data_loader, desc="Training", unit="batch"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        data = batch['data'].view(batch['data'].size(0), -1).to(device)
        normalized_label = batch['normalized_label'].float().unsqueeze(1).to(device)
        targets = batch['target'].float().unsqueeze(1).to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask, data)
       
        loss = criterion(outputs, normalized_label)
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
        outputs = denormalize(outputs)
        
        all_preds.extend(outputs.cpu().tolist())
        all_labels.extend(targets.cpu().tolist())
        
    avg_loss = total_loss/len(data_loader)
    train_mae = mean_absolute_error(all_labels, all_preds)
    train_rmse = rmse(all_labels, all_preds)

    end_time = time.time()  
    epoch_duration = end_time - start_time  
    
    print(f'Training Loss: {avg_loss:.4f}, Train MAE: {train_mae:.4f}, Train RMSE: {train_rmse:.4f}, Time: {epoch_duration:.2f} seconds')
    return avg_loss, train_mae

def evaluate(model, data_loader, criterion, device):
    
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    start_time = time.time()
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation", unit="batch"):
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            data = batch['data'].view(batch['data'].size(0), -1).to(device)
            normalized_label = batch['normalized_label'].float().unsqueeze(1).to(device)
            targets = batch['target'].float().unsqueeze(1).to(device)
            
            outputs = model(input_ids, attention_mask, data)
            
            loss = criterion(outputs, normalized_label)
            total_loss += loss.item()
            
            outputs = denormalize(outputs)
            
            all_preds.extend(outputs.cpu().tolist())
            all_labels.extend(targets.cpu().tolist())
    

    avg_loss = total_loss/len(data_loader)
    val_mae = mean_absolute_error(all_labels, all_preds)
    val_rmse = rmse(all_labels, all_preds)

    end_time = time.time()  
    epoch_duration = end_time - start_time  
    
    print(f'Validation Loss: {avg_loss:.4f}, Validation MAE: {val_mae:.4f}, Validation RMSE: {val_rmse:.4f}, Time: {epoch_duration:.2f} seconds')
    
    return avg_loss, val_mae

In [22]:
best_val_loss = float('inf')  
best_val_mae = float('inf')              
model_save_path = "4.pth"

num_epochs = 10

train_losses = []
val_losses = []
train_mae_scores = []
val_mae_scores = []

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    train_loss, train_mae = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_mae = evaluate(model, val_loader, criterion, device)
    
    train_losses.append(train_loss)  
    train_mae_scores.append(train_mae)  
    val_losses.append(val_loss)  
    val_mae_scores.append(val_mae)  
    
    if val_mae < best_val_mae:
        best_val_mae = val_mae
        torch.save(model.state_dict(), model_save_path)   
        print(f"Best model saved with validation MAE: {best_val_mae:.4f}")

    print('-' * 100)

Epoch 1/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:18<00:00,  1.76batch/s]


Training Loss: 0.0165, Train MAE: 60.7080, Train RMSE: 103.1897, Time: 318.32 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:18<00:00,  3.82batch/s]


Validation Loss: 0.1211, Validation MAE: 55.2290, Validation RMSE: 280.3137, Time: 18.87 seconds
Best model saved with validation MAE: 55.2290
----------------------------------------------------------------------------------------------------
Epoch 2/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:40<00:00,  1.65batch/s]


Training Loss: 0.0082, Train MAE: 44.2974, Train RMSE: 72.7588, Time: 340.17 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.72batch/s]


Validation Loss: 0.1242, Validation MAE: 67.2481, Validation RMSE: 284.0526, Time: 19.34 seconds
----------------------------------------------------------------------------------------------------
Epoch 3/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:41<00:00,  1.64batch/s]


Training Loss: 0.0077, Train MAE: 42.4591, Train RMSE: 70.4942, Time: 341.20 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.77batch/s]


Validation Loss: 0.1191, Validation MAE: 53.9015, Validation RMSE: 278.1566, Time: 19.11 seconds
Best model saved with validation MAE: 53.9015
----------------------------------------------------------------------------------------------------
Epoch 4/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:45<00:00,  1.62batch/s]


Training Loss: 0.0075, Train MAE: 42.0722, Train RMSE: 69.3298, Time: 345.53 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.72batch/s]


Validation Loss: 0.1191, Validation MAE: 52.1209, Validation RMSE: 278.1376, Time: 19.38 seconds
Best model saved with validation MAE: 52.1209
----------------------------------------------------------------------------------------------------
Epoch 5/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:45<00:00,  1.62batch/s]


Training Loss: 0.0071, Train MAE: 41.2544, Train RMSE: 67.7611, Time: 345.50 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.61batch/s]


Validation Loss: 0.1188, Validation MAE: 55.3955, Validation RMSE: 277.7161, Time: 19.96 seconds
----------------------------------------------------------------------------------------------------
Epoch 6/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:45<00:00,  1.63batch/s]


Training Loss: 0.0068, Train MAE: 40.4724, Train RMSE: 66.2298, Time: 345.14 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.70batch/s]


Validation Loss: 0.1212, Validation MAE: 56.3580, Validation RMSE: 280.5236, Time: 19.48 seconds
----------------------------------------------------------------------------------------------------
Epoch 7/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:45<00:00,  1.63batch/s]


Training Loss: 0.0066, Train MAE: 39.7789, Train RMSE: 65.3970, Time: 345.21 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.73batch/s]


Validation Loss: 0.1188, Validation MAE: 58.1653, Validation RMSE: 277.6084, Time: 19.31 seconds
----------------------------------------------------------------------------------------------------
Epoch 8/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:45<00:00,  1.62batch/s]


Training Loss: 0.0064, Train MAE: 39.6837, Train RMSE: 64.4249, Time: 345.80 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.72batch/s]


Validation Loss: 0.1190, Validation MAE: 54.0286, Validation RMSE: 277.9683, Time: 19.38 seconds
----------------------------------------------------------------------------------------------------
Epoch 9/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:46<00:00,  1.62batch/s]


Training Loss: 0.0064, Train MAE: 39.0541, Train RMSE: 63.9878, Time: 346.64 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.71batch/s]


Validation Loss: 0.1183, Validation MAE: 53.2976, Validation RMSE: 277.1546, Time: 19.44 seconds
----------------------------------------------------------------------------------------------------
Epoch 10/10


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:45<00:00,  1.62batch/s]


Training Loss: 0.0060, Train MAE: 38.2230, Train RMSE: 62.4193, Time: 345.77 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.73batch/s]

Validation Loss: 0.1181, Validation MAE: 52.2401, Validation RMSE: 277.0105, Time: 19.30 seconds
----------------------------------------------------------------------------------------------------





In [23]:
model1 = MultiModal_RegressionModel(input_dim)
model1.to(device)

model1.load_state_dict(torch.load(model_save_path))
model1.to(device)
all_labels = []
all_preds = []

start_time = time.time()

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing", unit="batch"):

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        data = batch['data'].view(batch['data'].size(0), -1).to(device)
        normalized_label = batch['normalized_label'].float().unsqueeze(1).to(device)
        targets = batch['target'].float().unsqueeze(1).to(device)
            
        outputs = model(input_ids, attention_mask, data)
        outputs = denormalize(outputs)
        
        all_preds.extend(outputs.cpu().tolist())
        all_labels.extend(targets.cpu().tolist())
        
all_labels = np.array(all_labels).flatten()
all_preds = np.array(all_preds).flatten()
        
test_mae = mean_absolute_error(all_labels, all_preds)
test_rmse = rmse(all_labels, all_preds)

end_time = time.time()  
epoch_duration = end_time - start_time

print(f'Testing MAE: {test_mae:.4f}, Testing RMSE: {test_rmse:.4f}, Time: {epoch_duration:.2f} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Testing: 100%|██████████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.76batch/s]

Testing MAE: 47.2100, Testing RMSE: 137.6067, Time: 19.17 seconds



