In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [2]:
train_df = pd.read_csv('clean_data.csv')
val_df = pd.read_csv('clean_val.csv')
test_df = pd.read_csv('clean_test.csv')

In [3]:
train_df.dropna(inplace = True)

In [4]:
train_df.isnull().sum().sum()

0

In [5]:
with open("numerical.txt", "r", encoding="utf-8") as file:
    numeric_features = [line.strip() for line in file.readlines()]
    
with open("cat.txt", "r", encoding="utf-8") as file:
    cat_features = [line.strip() for line in file.readlines()]
    
with open("text.txt", "r", encoding="utf-8") as file:
    text_features = [line.strip() for line in file.readlines()]

In [6]:
def num2text(df):
    sent = []

    for _, row in df.iterrows():
        sent.append(f"It accommodates to {int(row['accommodates'])} guests, with {int(row['bedrooms'])} bedrooms, {int(row['bathrooms'])} bathrooms, and {int(row['beds'])} beds. The cleaning fee costs ${row['cleaning_fee']}. Available for {int(row['availability_30'])} days in the last 30 days and {int(row['availability_365'])} days in the last year. {int(row['number_of_reviews'])} review.")
    return sent

def cat2text(df):
    result = []
    sentences = []

    for index, row in df[cat_features].iterrows():
        cols_with_t = df[cat_features].columns[row == 't'].tolist()

        result.append(cols_with_t)

    for i in range(len(result)):
        if len(result[i]) > 0:
            if len(result[i]) > 1:
                sentence = f"This Airbnb offers amenities such as: {', '.join(result[i][:-1])}, and {result[i][-1]}."
            else:
                sentence = f"TThis Airbnb offers amenities such as: {result[i][0]}."
        else:
            sentence = "This Airbnb has no additional features."

        sentences.append(sentence)
    return sentences

def combine_text(df):
    df['combined_text'] = df['summary'] + " [SEP] "  + df['host_about'] + " [SEP] " + [cat + " " + num for cat, num in zip(cat2text(df), num2text(df))]
    return


In [7]:
combine_text(train_df)
combine_text(val_df)
combine_text(test_df)

In [12]:
train_df['combined_text'].iloc[3]

"The space is separated from the living room by a screen. However it is as big as a bedroom and has a 'window' (actually it's a door) towards the balcony (east-facing). There are a desk, a chair, a lamp, a double size mattress & a portable wardrobe. You will have two flat mates in their 20s and a bunny pet to keep you company. Please NOTE that the living room are affected by NOISE and SMELL from cooking. [SEP] Unknown [SEP] This Airbnb offers amenities such as: hair_dryer, shampoo, heating, and smoking_allowed. It accommodates to 2 guests, with 2 bedrooms, 2 bathrooms, and 2 beds. The cleaning fee costs $11.0. Available for 0 days in the last 30 days and 0 days in the last year. 19 review."

In [None]:
def normalize(x):
    return (x - train_df['price'].min()) / (train_df['price'].max() - train_df['price'].min())

def denormalize(x):
    return x * (train_df['price'].max() - train_df['price'].min()) + train_df['price'].min()

In [None]:
train_df['normalized_price'] = train_df['price'].apply(normalize)
val_df['normalized_price'] = val_df['price'].apply(normalize)
test_df['normalized_price'] = test_df['price'].apply(normalize)

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, XLMRobertaTokenizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [10]:
class MyDataset(Dataset):
    def __init__(self, text, label, tokenizer, max_len):
        self.text = text.reset_index(drop=True)
        self.label = label.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = str(self.text[index])
        label = self.label[index]
        encoding = self.tokenizer.encode_plus(
                    text = text,
                    padding = 'max_length',
                    truncation = True,
                    max_length=self.max_len,
                    return_tensors = 'pt',
                    return_token_type_ids = False,
                    return_attention_mask = True )
        
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'label':torch.tensor(label)
        }

        

In [11]:
train_X = train_df['combined_text']
train_norm_y = train_df['normalized_price']
train_y = train_df['price']

val_X = val_df['combined_text']
val_norm_y = val_df['normalized_price']
val_y = val_df['price']

test_X = test_df['combined_text']
test_norm_y = test_df['normalized_price']
test_y = test_df['price']

In [12]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
max_len = 128

In [13]:
batch_size = 32

train_dataset = MyDataset(train_X, train_norm_y, train_y, tokenizer, max_len)
val_dataset = MyDataset(val_X, val_norm_y, val_y, tokenizer, max_len)
test_dataset = MyDataset(test_X, test_norm_y, test_y, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

In [14]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [15]:
class BertRegression(nn.Module):
    def __init__(self):
        super(BertRegression, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
        
        for layer in self.bert_model.encoder.layer[:6]:
            for param in layer.parameters():
                param.requires_grad = False
        
        for layer in self.bert_model.encoder.layer[6:]:
            for param in layer.parameters():
                param.requires_grad = True  
            
        self.drop_out = nn.Dropout(0.2)
        self.linear = nn.Linear(768, 1)
        
    def forward(self, input_ids, attention_mask):
        output = self.bert_model(input_ids, attention_mask)
        output = self.drop_out(output.pooler_output)
        logits = self.linear(output)
        return logits
        

In [16]:
model = BertRegression()
model.to(device)

learning_rate = 1e-03 
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

criterion = nn.MSELoss()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
import time
import math
from tqdm import tqdm

def rmse(y, y_predict):
    mse = mean_squared_error(y, y_predict)
    return math.sqrt(mse)

def train(model, data_loader, optimizer, criterion, device):
    
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    start_time = time.time()
    
    for batch in tqdm(data_loader, desc="Training", unit="batch"):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].float().unsqueeze(1).to(device)
        
        outputs = model(input_ids, attention_mask)
        
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        
        loss.backward()
        
        optimizer.step()
        
        all_preds.extend(outputs.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
        
    avg_loss = total_loss/len(data_loader)
    train_mae = mean_absolute_error(all_labels, all_preds)
    train_rmse = rmse(all_labels, all_preds)

    end_time = time.time()  
    epoch_duration = end_time - start_time  
    
    print(f'Training Loss: {avg_loss:.4f}, Train MAE: {train_mae:.4f}, Train RMSE: {train_rmse:.4f}, Time: {epoch_duration:.2f} seconds')
    return avg_loss, train_mae

def evaluate(model, data_loader, criterion, device):
    
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    start_time = time.time()
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation", unit="batch"):
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].float().unsqueeze(1).to(device)
            
            outputs = model(input_ids, attention_mask)
            
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            
            all_preds.extend(outputs.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    

    avg_loss = total_loss/len(data_loader)
    val_mae = mean_absolute_error(all_labels, all_preds)
    val_rmse = rmse(all_labels, all_preds)

    end_time = time.time()  
    epoch_duration = end_time - start_time  
    
    print(f'Validation Loss: {avg_loss:.4f}, Validation MAE: {val_mae:.4f}, Validation RMSE: {val_rmse:.4f}, Time: {epoch_duration:.2f} seconds')
    
    return avg_loss, val_mae

In [18]:
best_val_loss = float('inf')  
best_val_mae = float('inf')              
model_save_path = "13_.pth"

num_epochs = 5

train_losses = []
val_losses = []
train_mae_scores = []
val_mae_scores = []

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    train_loss, train_mae = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_mae = evaluate(model, val_loader, criterion, device)
    
    train_losses.append(train_loss)  
    train_mae_scores.append(train_mae)  
    val_losses.append(val_loss)  
    val_mae_scores.append(val_mae)  
    
    if val_mae < best_val_mae:
        best_val_mae = val_mae
        torch.save(model.state_dict(), model_save_path)   
        print(f"Best model saved with validation MAE: {best_val_mae:.4f}")

    print('-' * 100)

Epoch 1/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [43:05<00:00,  4.61s/batch]


Training Loss: 12154.3024, Train MAE: 72.1284, Train RMSE: 110.2682, Time: 2585.11 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [01:40<00:00,  1.40s/batch]


Validation Loss: 84866.8825, Validation MAE: 84.4131, Validation RMSE: 292.2609, Time: 100.82 seconds
Best model saved with validation MAE: 84.4131
----------------------------------------------------------------------------------------------------
Epoch 2/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [46:29<00:00,  4.97s/batch]


Training Loss: 10378.2179, Train MAE: 68.4434, Train RMSE: 101.8886, Time: 2789.59 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [01:56<00:00,  1.62s/batch]


Validation Loss: 84916.2363, Validation MAE: 84.4649, Validation RMSE: 292.2483, Time: 116.85 seconds
----------------------------------------------------------------------------------------------------
Epoch 3/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [44:38<00:00,  4.77s/batch]


Training Loss: 10382.2047, Train MAE: 68.5649, Train RMSE: 101.9087, Time: 2678.11 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [01:40<00:00,  1.40s/batch]


Validation Loss: 84988.9237, Validation MAE: 84.0588, Validation RMSE: 292.3521, Time: 100.87 seconds
Best model saved with validation MAE: 84.0588
----------------------------------------------------------------------------------------------------
Epoch 4/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [43:05<00:00,  4.61s/batch]


Training Loss: 10381.5749, Train MAE: 68.5001, Train RMSE: 101.9026, Time: 2585.51 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [01:40<00:00,  1.40s/batch]


Validation Loss: 85010.1027, Validation MAE: 83.6266, Validation RMSE: 292.4763, Time: 100.85 seconds
Best model saved with validation MAE: 83.6266
----------------------------------------------------------------------------------------------------
Epoch 5/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [45:28<00:00,  4.86s/batch]


Training Loss: 10382.8695, Train MAE: 68.4823, Train RMSE: 101.9073, Time: 2728.58 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [01:57<00:00,  1.63s/batch]

Validation Loss: 84900.0454, Validation MAE: 84.4307, Validation RMSE: 292.2566, Time: 117.16 seconds
----------------------------------------------------------------------------------------------------





In [19]:
model1 = BertRegression()
model1.to(device)

model1.load_state_dict(torch.load(model_save_path))
model1.to(device)
all_labels = []
all_preds = []

start_time = time.time()

for batch in tqdm(test_loader, desc="Testing", unit="batch"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].float().unsqueeze(1).to(device)
    
    with torch.no_grad():
        outputs = model1(input_ids, attention_mask)
        
        all_preds.extend(outputs.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())  
        
all_labels = np.array(all_labels).flatten()
all_preds = np.array(all_preds).flatten()
        
test_mae = mean_absolute_error(all_labels, all_preds)
test_rmse = rmse(all_labels, all_preds)

end_time = time.time()  
epoch_duration = end_time - start_time

print(f'Testing MAE: {test_mae:.4f}, Testing RMSE: {test_rmse:.4f}, Time: {epoch_duration:.2f} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Testing: 100%|██████████████████████████████████████████████████████████████████████| 72/72 [02:11<00:00,  1.83s/batch]

Testing MAE: 78.4469, Testing RMSE: 176.1595, Time: 131.82 seconds



