In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import math
from scipy import stats

pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [2]:
train_df = pd.read_csv('clean_data.csv')
val_df = pd.read_csv('clean_val.csv')
test_df = pd.read_csv('clean_test.csv')

In [3]:
train_df.dropna(inplace = True)

In [4]:
train_df.isnull().sum().sum()

0

In [5]:
with open("text.txt", "r", encoding="utf-8") as file:
    text_features = [line.strip() for line in file.readlines()]

In [6]:
text_features

['summary', 'host_about']

In [7]:
def combine_text(df):
    df['combined_text'] = df['summary'] + " [SEP] " +  df['host_about'] + " [SEP] "
    return 

In [8]:
combine_text(train_df)
combine_text(val_df)
combine_text(test_df)

In [11]:
test_df['combined_text'].iloc[2]

"We're in the US for 3 weeks [SEP] I’m an Aussie living in Uganda. [SEP] "

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, XLMRobertaTokenizer

from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, mean_absolute_error

In [11]:
class MyDataset(Dataset):
    def __init__(self, text, normalized_label, label, tokenizer, max_len):
        self.text = text.reset_index(drop=True)
        self.label = label.reset_index(drop=True)
        self.normalized_label = normalized_label.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = str(self.text[index])
        normalized_label = self.normalized_label[index]
        label = self.label[index]
        encoding = self.tokenizer.encode_plus(
                    text = text,
                    padding = 'max_length',
                    truncation = True,
                    max_length=self.max_len,
                    return_tensors = 'pt',
                    return_token_type_ids = False,
                    return_attention_mask = True )
        
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'normalized_label':torch.tensor(normalized_label),
                'label':torch.tensor(label)
        }

        

In [12]:
val_df['price'].max()

9000

In [13]:
def normalize(x):
    return (x - train_df['price'].min()) / (train_df['price'].max() - train_df['price'].min())

def denormalize(x):
    return x * (train_df['price'].max() - train_df['price'].min()) + train_df['price'].min()

In [14]:
train_df['normalized_price'] = train_df['price'].apply(normalize)
val_df['normalized_price'] = val_df['price'].apply(normalize)
test_df['normalized_price'] = test_df['price'].apply(normalize)

In [15]:
train_X = train_df['combined_text']
train_norm_y = train_df['normalized_price']
train_y = train_df['price']

val_X = val_df['combined_text']
val_norm_y = val_df['normalized_price']
val_y = val_df['price']

test_X = test_df['combined_text']
test_norm_y = test_df['normalized_price']
test_y = test_df['price']

In [16]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
max_len = 128

In [17]:
batch_size = 32

train_dataset = MyDataset(train_X, train_norm_y, train_y, tokenizer, max_len)
val_dataset = MyDataset(val_X, val_norm_y, val_y, tokenizer, max_len)
test_dataset = MyDataset(test_X, test_norm_y, test_y, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = False)

In [18]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [19]:
class BertRegression(nn.Module):
    def __init__(self):
        super(BertRegression, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')
        
        for layer in self.bert_model.encoder.layer[:6]:
            for param in layer.parameters():
                param.requires_grad = False
        
        for layer in self.bert_model.encoder.layer[6:]:
            for param in layer.parameters():
                param.requires_grad = True  
            
        self.drop_out = nn.Dropout(0.2)
        self.linear = nn.Linear(768, 1)
        
    def forward(self, input_ids, attention_mask):
        output = self.bert_model(input_ids, attention_mask)
        output = self.drop_out(output.pooler_output)
        logits = self.linear(output)
        return logits
        

In [20]:
model = BertRegression()
model.to(device)

learning_rate = 1e-03 
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

criterion = nn.MSELoss()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
import time
import math
from tqdm import tqdm

def rmse(y, y_predict):
    mse = mean_squared_error(y, y_predict)
    return math.sqrt(mse)

def train(model, data_loader, optimizer, criterion, device):
    
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    start_time = time.time()
    
    for batch in tqdm(data_loader, desc="Training", unit="batch"):
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        normalized_label = batch['normalized_label'].float().unsqueeze(1).to(device)
        labels = batch['label'].float().unsqueeze(1).to(device)
        
        outputs = model(input_ids, attention_mask)
        
        loss = criterion(outputs, normalized_label)
        total_loss += loss.item()
        
        loss.backward()
        
        optimizer.step()
        
        outputs = denormalize(outputs)
        
        all_preds.extend(outputs.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())
        
    avg_loss = total_loss/len(data_loader)
    train_mae = mean_absolute_error(all_labels, all_preds)
    train_rmse = rmse(all_labels, all_preds)

    end_time = time.time()  
    epoch_duration = end_time - start_time  
    
    print(f'Training Loss: {avg_loss:.4f}, Train MAE: {train_mae:.4f}, Train RMSE: {train_rmse:.4f}, Time: {epoch_duration:.2f} seconds')
    return avg_loss, train_mae

def evaluate(model, data_loader, criterion, device):
    
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    start_time = time.time()
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation", unit="batch"):
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            normalized_label = batch['normalized_label'].float().unsqueeze(1).to(device)
            labels = batch['label'].float().unsqueeze(1).to(device)
            
            outputs = model(input_ids, attention_mask)
            
            loss = criterion(outputs, normalized_label)
            total_loss += loss.item()
            
            outputs = denormalize(outputs)
            
            all_preds.extend(outputs.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    

    avg_loss = total_loss/len(data_loader)
    val_mae = mean_absolute_error(all_labels, all_preds)
    val_rmse = rmse(all_labels, all_preds)

    end_time = time.time()  
    epoch_duration = end_time - start_time  
    
    print(f'Validation Loss: {avg_loss:.4f}, Validation MAE: {val_mae:.4f}, Validation RMSE: {val_rmse:.4f}, Time: {epoch_duration:.2f} seconds')
    
    return avg_loss, val_mae

In [22]:
best_val_loss = float('inf')  
best_val_mae = float('inf')              
model_save_path = "1_.pth"

num_epochs = 5

train_losses = []
val_losses = []
train_mae_scores = []
val_mae_scores = []

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    
    train_loss, train_mae = train(model, train_loader, optimizer, criterion, device)
    val_loss, val_mae = evaluate(model, val_loader, criterion, device)
    
    train_losses.append(train_loss)  
    train_mae_scores.append(train_mae)  
    val_losses.append(val_loss)  
    val_mae_scores.append(val_mae)  
    
    if val_mae < best_val_mae:
        best_val_mae = val_mae
        torch.save(model.state_dict(), model_save_path)   
        print(f"Best model saved with validation MAE: {best_val_mae:.4f}")

    print('-' * 100)

Epoch 1/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:30<00:00,  1.70batch/s]


Training Loss: 0.0783, Train MAE: 164.7023, Train RMSE: 224.6885, Time: 330.58 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:19<00:00,  3.61batch/s]


Validation Loss: 0.1322, Validation MAE: 82.5310, Validation RMSE: 292.9271, Time: 19.97 seconds
Best model saved with validation MAE: 82.5310
----------------------------------------------------------------------------------------------------
Epoch 2/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [06:01<00:00,  1.55batch/s]


Training Loss: 0.0234, Train MAE: 88.8177, Train RMSE: 122.7873, Time: 361.35 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:20<00:00,  3.51batch/s]


Validation Loss: 0.1313, Validation MAE: 93.4966, Validation RMSE: 291.9732, Time: 20.49 seconds
----------------------------------------------------------------------------------------------------
Epoch 3/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [05:58<00:00,  1.56batch/s]


Training Loss: 0.0168, Train MAE: 70.3629, Train RMSE: 103.9427, Time: 358.89 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [00:20<00:00,  3.54batch/s]


Validation Loss: 0.1321, Validation MAE: 82.7987, Validation RMSE: 292.7905, Time: 20.34 seconds
----------------------------------------------------------------------------------------------------
Epoch 4/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [21:27<00:00,  2.30s/batch]


Training Loss: 0.0166, Train MAE: 69.6420, Train RMSE: 103.3790, Time: 1287.86 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [01:08<00:00,  1.05batch/s]


Validation Loss: 0.1319, Validation MAE: 85.7592, Validation RMSE: 292.0211, Time: 68.45 seconds
----------------------------------------------------------------------------------------------------
Epoch 5/5


Training: 100%|███████████████████████████████████████████████████████████████████| 561/561 [26:33<00:00,  2.84s/batch]


Training Loss: 0.0164, Train MAE: 69.4200, Train RMSE: 102.8703, Time: 1593.20 seconds


Validation: 100%|███████████████████████████████████████████████████████████████████| 72/72 [01:38<00:00,  1.37s/batch]


Validation Loss: 0.1332, Validation MAE: 81.4038, Validation RMSE: 294.0677, Time: 98.42 seconds
Best model saved with validation MAE: 81.4038
----------------------------------------------------------------------------------------------------


In [23]:
model1 = BertRegression()
model1.to(device)

model1.load_state_dict(torch.load(model_save_path))
model1.to(device)
all_labels = []
all_preds = []

start_time = time.time()

for batch in tqdm(test_loader, desc="Testing", unit="batch"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    normalized_label = batch['normalized_label'].float().unsqueeze(1).to(device)
    labels = batch['label'].float().unsqueeze(1).to(device)
    
    with torch.no_grad():
        outputs = model1(input_ids, attention_mask)
        outputs = denormalize(outputs)
        
        all_preds.extend(outputs.cpu().tolist())
        all_labels.extend(labels.cpu().tolist())  
        
all_labels = np.array(all_labels).flatten()
all_preds = np.array(all_preds).flatten()
        
test_mae = mean_absolute_error(all_labels, all_preds)
test_rmse = rmse(all_labels, all_preds)

end_time = time.time()  
epoch_duration = end_time - start_time

print(f'Testing MAE: {test_mae:.4f}, Testing RMSE: {test_rmse:.4f}, Time: {epoch_duration:.2f} seconds')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Testing: 100%|██████████████████████████████████████████████████████████████████████| 72/72 [01:40<00:00,  1.39s/batch]

Testing MAE: 77.3538, Testing RMSE: 178.6278, Time: 100.41 seconds



