# 1. Import Packages

# 0. TODO

* [x] Evaluation metrics for ranking: accuracy@k, NDCG, Kendall's tau
* [x] Try a new loss function (greater loss on bigger cascades)
* [x] BERT fine-tune
* [x] Add other post's features: text entity count
* [x] Make train / validation / test splitting fixed
* [x] Check labels distribution for top-k cascades (because HR is poor)
* [x] Do feature normalization before feeding into MLP
* [ ] Up-sample imbalanced data
* [x] Text data preprocessing
* [x] Add temporal features

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers as ppb
from transformers import AdamW

import matplotlib.pyplot as plt

import warnings
import os.path as path

import sklearn
import scipy

setup:

In [None]:
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Compute device: CUDA when a GPU is visible to PyTorch, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"device type: {device.type}")

# 2. Import Dataset

In [None]:
# Path of the CSV file holding the news articles (read into `news_df` below).
DATASET_NEWS = "./data/all/news.csv"

In [None]:
# Read the news table; the first CSV row supplies the column names.
news_df = pd.read_csv(DATASET_NEWS, header=0)

print(f"Columns: {news_df.columns}")
print(f"news set size: {len(news_df.index)}")

In [None]:
# Paths of the pre-split source-tweet CSV files: training, cross-validation
# and test sets.
DATASET_TRAIN = "./data/splitted/source_tweet_processed_train.csv"
DATASET_CV = "./data/splitted/source_tweet_processed_cv.csv"
DATASET_TEST = "./data/splitted/source_tweet_processed_test.csv"

In [None]:
# Load the three tweet splits in one pass; each CSV's first row is its header.
train_df, cv_df, test_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (DATASET_TRAIN, DATASET_CV, DATASET_TEST)
)

print(f"Columns: {train_df.columns}")
print(f"train set size: {len(train_df.index)}")
print(f"cross validation set size: {len(cv_df.index)}")
print(f"test set size: {len(test_df.index)}")

Check for label distribution:

In [None]:
# Number of tweets per cascade-size label in every split.
train_label_counts = train_df['cascade_size'].value_counts()
cv_label_counts = cv_df['cascade_size'].value_counts()
test_label_counts = test_df['cascade_size'].value_counts()


def _format_label_shares(split_name, label_counts, total, labels=(1, 2, 3, 4)):
    """Return one report line with the share of each label in a split.

    A label that never occurs in the split is reported as 0.0000 instead of
    raising KeyError, and an empty split yields zeros rather than a
    division by zero.
    """
    shares = ", ".join(
        f"label={label}: {(label_counts.get(label, 0) / total if total else 0.0):.4f}"
        for label in labels
    )
    return f"{split_name}: {shares}"


print(_format_label_shares("train set", train_label_counts, len(train_df)))
print(_format_label_shares("cv set", cv_label_counts, len(cv_df)))
print(_format_label_shares("test set", test_label_counts, len(test_df)))

# 3. Data Preprocessing

### News Text Features

tokenize:

In [None]:
# Load the pretrained 'distilbert-base-uncased' tokenizer. The same tokenizer
# object is reused below for news titles, news texts and tweet texts.

# For DistilBERT:
tokenizer = ppb.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def _encode_news_field(raw_text):
    """Convert one string into token ids, including the tokenizer's special tokens."""
    return tokenizer.encode(raw_text, add_special_tokens=True)


# One list of token ids per news article, for the title and for the body text.
news_title_tokenized = news_df['title'].apply(_encode_news_field)
print(f"news title tokenized shape: {news_title_tokenized.shape}")

news_text_tokenized = news_df['text'].apply(_encode_news_field)
# The label used to read "news test"; this shape belongs to the news *text*.
print(f"news text tokenized shape: {news_text_tokenized.shape}")

padding:

In [None]:
# Mean number of tokens per news title, truncated to an integer.
total_length = sum(len(token_ids) for token_ids in news_title_tokenized.values)

avg_news_title_length = int(total_length / news_title_tokenized.shape[0])
print(avg_news_title_length)

In [None]:
# Mean number of tokens per news body text, truncated to an integer.
total_length = sum(len(token_ids) for token_ids in news_text_tokenized.values)

avg_news_text_length = int(total_length / news_text_tokenized.shape[0])
print(avg_news_text_length)

In [None]:
# Fixed token lengths: tokenized news titles / texts are cut to these sizes
# and shorter ones are zero-padded up to them in the cells below.
NEWS_TITLE_LENGTH = 20
NEWS_TEXT_LENGTH = 100

In [None]:
# Cut every tokenized title / text down to its fixed maximum length.
# NOTE(review): plain slicing drops whatever the tokenizer appended at the end
# of longer sequences (presumably the [SEP] token) - confirm this is intended.
news_title_tokenized = news_title_tokenized.apply(
    lambda token_ids: token_ids[:NEWS_TITLE_LENGTH]
)
news_text_tokenized = news_text_tokenized.apply(
    lambda token_ids: token_ids[:NEWS_TEXT_LENGTH]
)

In [None]:
# Right-pad each id list with zeros up to the fixed length so every row has
# the same width and the result is a rectangular array.
news_title_padded = np.array([
    token_ids + [0] * (NEWS_TITLE_LENGTH - len(token_ids))
    for token_ids in news_title_tokenized.values
])
news_text_padded = np.array([
    token_ids + [0] * (NEWS_TEXT_LENGTH - len(token_ids))
    for token_ids in news_text_tokenized.values
])

print(f"news_title_padded: {news_title_padded.shape}")
print(f"news_text_padded: {news_text_padded.shape}")

masking:

In [None]:
# Attention masks: 1 where the padded array holds a non-zero token id,
# 0 on the zero padding.
news_title_attention_mask = (news_title_padded != 0).astype(int)
news_text_attention_mask = (news_text_padded != 0).astype(int)

Put it into GPU:

In [None]:
# Columns of `news_df` that are used as numeric news features.
NEWS_NUMERIC_COLUMNS = ['img_count']

In [None]:
# Resolve every news id to its row position once, so each tweet looks up its
# article in O(1) instead of scanning news_df for every row.
# Assumes news_df has the default 0..n-1 index (as produced by pd.read_csv
# above), so a row's position also indexes the padded / mask arrays.
_news_position_by_id = {}
for _position, _news_id in enumerate(news_df['id'].values):
    # keep the first occurrence when an id is duplicated
    _news_position_by_id.setdefault(_news_id, _position)

_news_numeric_matrix = news_df[NEWS_NUMERIC_COLUMNS].values


def _gather_news_features(split_df):
    """Collect the news features that belong to each tweet of one split.

    Returns five lists aligned with the rows of `split_df`:
    (text ids, text attention mask, title ids, title attention mask,
    numeric news features).

    Raises KeyError when a tweet references a news id missing from news_df.
    """
    text_ids, text_mask, title_ids, title_mask, numeric = [], [], [], [], []
    for news_id in split_df['news_id'].values:
        row = _news_position_by_id[news_id]
        text_ids.append(news_text_padded[row])
        # the masks must come from the attention-mask arrays, not from the
        # padded token ids
        text_mask.append(news_text_attention_mask[row])
        title_ids.append(news_title_padded[row])
        title_mask.append(news_title_attention_mask[row])
        numeric.append(_news_numeric_matrix[row])
    return text_ids, text_mask, title_ids, title_mask, numeric


(train_news_text_tensors, train_news_text_mask,
 train_news_title_tensors, train_news_title_mask,
 train_news_numeric_tensors) = _gather_news_features(train_df)

(cv_news_text_tensors, cv_news_text_mask,
 cv_news_title_tensors, cv_news_title_mask,
 cv_news_numeric_tensors) = _gather_news_features(cv_df)

(test_news_text_tensors, test_news_text_mask,
 test_news_title_tensors, test_news_title_mask,
 test_news_numeric_tensors) = _gather_news_features(test_df)

In [None]:
def _stack_on_device(rows, dtype):
    """Turn a list of equally sized rows into one tensor of `dtype` on `device`.

    The rows are stacked through a single NumPy array first, because building
    a tensor directly from a list of arrays is very slow. An input that is
    already a tensor (cell re-run) is only converted and moved.
    """
    if torch.is_tensor(rows):
        return rows.to(device=device, dtype=dtype)
    return torch.tensor(np.asarray(rows), dtype=dtype).to(device)


# Token ids and attention masks are integer inputs for the BERT encoders.
train_news_text_tensors = _stack_on_device(train_news_text_tensors, torch.long)
cv_news_text_tensors = _stack_on_device(cv_news_text_tensors, torch.long)
test_news_text_tensors = _stack_on_device(test_news_text_tensors, torch.long)
train_news_text_mask = _stack_on_device(train_news_text_mask, torch.long)
cv_news_text_mask = _stack_on_device(cv_news_text_mask, torch.long)
test_news_text_mask = _stack_on_device(test_news_text_mask, torch.long)

train_news_title_tensors = _stack_on_device(train_news_title_tensors, torch.long)
cv_news_title_tensors = _stack_on_device(cv_news_title_tensors, torch.long)
test_news_title_tensors = _stack_on_device(test_news_title_tensors, torch.long)
train_news_title_mask = _stack_on_device(train_news_title_mask, torch.long)
cv_news_title_mask = _stack_on_device(cv_news_title_mask, torch.long)
test_news_title_mask = _stack_on_device(test_news_title_mask, torch.long)

# Numeric news features are concatenated with the float tweet features before
# the MLP, so they are stored as float too (a long cast would also truncate
# any non-integer feature added to NEWS_NUMERIC_COLUMNS later).
train_news_numeric_tensors = _stack_on_device(train_news_numeric_tensors, torch.float)
cv_news_numeric_tensors = _stack_on_device(cv_news_numeric_tensors, torch.float)
test_news_numeric_tensors = _stack_on_device(test_news_numeric_tensors, torch.float)

### Post Text Features

Tokenize:

In [None]:
def _encode_post_text(raw_text):
    """Convert one tweet text into token ids, including special tokens."""
    return tokenizer.encode(raw_text, add_special_tokens=True)


# One list of token ids per tweet, for each of the three splits.
train_tokenized = train_df['text'].apply(_encode_post_text)
cv_tokenized = cv_df['text'].apply(_encode_post_text)
test_tokenized = test_df['text'].apply(_encode_post_text)

print(f"train tokenized shape: {train_tokenized.shape}")

Padding:

In [None]:
# Mean token count over all tweets of the three splits, truncated to an int.
total_length = sum(
    len(token_ids)
    for tokenized in (train_tokenized, cv_tokenized, test_tokenized)
    for token_ids in tokenized.values
)

average_length = int(
    total_length
    / sum(tokenized.shape[0] for tokenized in (train_tokenized, cv_tokenized, test_tokenized))
)
print(average_length)

In [None]:
# Token count of every tweet (train, then cv, then test) and its median.
lengths = [
    len(token_ids)
    for tokenized in (train_tokenized, cv_tokenized, test_tokenized)
    for token_ids in tokenized.values
]

median_length = np.median(np.array(lengths))

In [None]:
# Longest tokenized tweet across the three splits (0 when there are none).
max_len = max(
    (
        len(token_ids)
        for tokenized in (train_tokenized, cv_tokenized, test_tokenized)
        for token_ids in tokenized.values
    ),
    default=0,
)

print(f"Max sentence length: {max_len}")

In [None]:
# Fixed tweet length in tokens: the median tokenized length over all splits
# (`median_length`, computed above), converted to an int.
TEXT_LENGTH = int(median_length)

In [None]:
def _clip_to_text_length(token_ids):
    """Keep at most the first TEXT_LENGTH token ids."""
    return token_ids[:TEXT_LENGTH]


# Cut every tokenized tweet down to TEXT_LENGTH tokens.
train_tokenized = train_tokenized.apply(_clip_to_text_length)
cv_tokenized = cv_tokenized.apply(_clip_to_text_length)
test_tokenized = test_tokenized.apply(_clip_to_text_length)

In [None]:
# Right-pad each tweet's id list with zeros up to TEXT_LENGTH so that every
# split becomes a rectangular array.
train_padded = np.array([
    token_ids + [0] * (TEXT_LENGTH - len(token_ids))
    for token_ids in train_tokenized.values
])
cv_padded = np.array([
    token_ids + [0] * (TEXT_LENGTH - len(token_ids))
    for token_ids in cv_tokenized.values
])
test_padded = np.array([
    token_ids + [0] * (TEXT_LENGTH - len(token_ids))
    for token_ids in test_tokenized.values
])

print(f"train_padded: {train_padded.shape}")

Masking:

In [None]:
# Attention masks: 1 on non-zero token ids, 0 on the zero padding.
train_attention_mask = (train_padded != 0).astype(int)
cv_attention_mask = (cv_padded != 0).astype(int)
test_attention_mask = (test_padded != 0).astype(int)

Put into GPU:

In [None]:
# Tweet token ids as long tensors on the training device.
train_text_tensors, cv_text_tensors, test_text_tensors = (
    torch.tensor(padded, dtype=torch.long).to(device)
    for padded in (train_padded, cv_padded, test_padded)
)

# Matching attention masks, also long tensors on the training device.
train_text_mask, cv_text_mask, test_text_mask = (
    torch.tensor(mask, dtype=torch.long).to(device)
    for mask in (train_attention_mask, cv_attention_mask, test_attention_mask)
)

### Numeric Features

In [None]:
# Count-style features of a tweet.
statistic_columns = [
    'user_count',
    'tag_count',
    'symbol_count',
    'url_count',
    'sentence_count',
]

# Temporal features: one column per hour (h_00 .. h_23), one per weekday,
# plus the holiday flag.
wds = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
hour_columns = ["h_{:02}".format(hour) for hour in range(24)]
weekday_columns = ["wday_" + weekday for weekday in wds]
temporal_columns = hour_columns + weekday_columns + ['is_holiday']

# Sentiment features of a tweet.
sentiment_columns = [
    'avg_sentiment_score',
    'sentiment_ratio',
    'pos_count',
    'neg_count',
]

# All numeric tweet features, in the order they are fed to the model.
numeric_columns = statistic_columns + temporal_columns + sentiment_columns

In [None]:
# Numeric tweet features of each split as float tensors on the training device.
train_numeric_tensors, cv_numeric_tensors, test_numeric_tensors = (
    torch.tensor(split_df[numeric_columns].values, dtype=torch.float).to(device)
    for split_df in (train_df, cv_df, test_df)
)

for numeric_tensor in (train_numeric_tensors, cv_numeric_tensors, test_numeric_tensors):
    print(numeric_tensor.shape)

### Labels

In [None]:
# Regression targets: the cascade size of each tweet as a float column
# vector (one row per tweet, one column) on the training device.
train_labels, cv_labels, test_labels = (
    torch.tensor(split_df['cascade_size'].values, dtype=torch.float).unsqueeze(1).to(device)
    for split_df in (train_df, cv_df, test_df)
)

print(train_labels.shape)

## Upsampling the minority classes

In [None]:
# shifted = torch.clone(train_labels).squeeze(1)
# shifted -= 1
# nonzero_indexs = torch.nonzero(shifted).squeeze(1)

In [None]:
# up_train_text_tensors = torch.clone(train_text_tensors)[nonzero_indexs]
# up_train_text_mask = torch.clone(train_text_mask)[nonzero_indexs]
# up_train_statistic_tensors =  torch.clone(train_statistic_tensors)[nonzero_indexs]
# up_train_temporal_tensors = torch.clone(train_temporal_tensors)[nonzero_indexs]
# up_train_labels = torch.clone(train_labels)[nonzero_indexs]

# 4. Modeling

### Weighted MSE Loss Function

In [None]:
def WeightedMSELoss(predicted, labels):
    """Squared error weighted by the label value, averaged over the batch.

    Each element's squared error is multiplied by its own label, so samples
    with a bigger cascade are penalised more. The weighted errors are summed
    and divided by the size of the first dimension of `predicted`.
    """
    per_element_error = F.mse_loss(predicted, labels, reduction='none')
    weighted_total = torch.sum(per_element_error * labels)
    batch_size = predicted.size(0)
    return weighted_total / batch_size

### Model Training: BERT (with fine-tuning) + MLP

In [None]:
class PopularityModel(nn.Module):
    """Regression head that predicts one value from text embeddings and extra features.

    Each of the three text embeddings (tweet, news title, news text) is first
    projected to a smaller size by its own linear layer + ReLU. The three
    projections are concatenated with the remaining features and passed
    through a four-layer ReLU MLP whose width halves at every layer and ends
    in a single output unit.
    """

    def __init__(self, tweet_in_dim=768, tweet_to_dim=128,
                 news_title_in_dim=768, news_title_to_dim=64,
                 news_text_in_dim=768, news_text_to_dim=128,
                 other_dimension = 42):
        super().__init__()

        # width of the concatenated vector that enters the MLP
        self.MLP_INPUT_DIM = sum(
            (tweet_to_dim, news_title_to_dim, news_text_to_dim, other_dimension)
        )
        half_dim = self.MLP_INPUT_DIM // 2
        quarter_dim = self.MLP_INPUT_DIM // 4
        eighth_dim = self.MLP_INPUT_DIM // 8

        # per-input projections of the text embeddings
        self.tweet_linear = nn.Linear(tweet_in_dim, tweet_to_dim)
        self.news_title_linear = nn.Linear(news_title_in_dim, news_title_to_dim)
        self.news_text_linear = nn.Linear(news_text_in_dim, news_text_to_dim)

        # MLP: input -> 1/2 -> 1/4 -> 1/8 -> 1
        self.linear1 = nn.Linear(self.MLP_INPUT_DIM, half_dim)
        self.linear2 = nn.Linear(half_dim, quarter_dim)
        self.linear3 = nn.Linear(quarter_dim, eighth_dim)
        self.linear4 = nn.Linear(eighth_dim, 1)

    def forward(self, tweet_embedded, news_title_embedded, news_text_embedded, other_features):
        """Return the prediction for a batch; every input has one row per sample."""
        projected = [
            F.relu(self.tweet_linear(tweet_embedded)),
            F.relu(self.news_title_linear(news_title_embedded)),
            F.relu(self.news_text_linear(news_text_embedded)),
        ]

        # join the projected text features with the remaining features
        # along the feature axis
        hidden = torch.cat((*projected, other_features), dim=1)

        # every MLP layer, including the output layer, is followed by a ReLU
        for layer in (self.linear1, self.linear2, self.linear3, self.linear4):
            hidden = F.relu(layer(hidden))

        return hidden

# 5. Evaluation

Check label distribution:

In [None]:
# Row indexes of the test labels ordered from the largest label to the smallest.
sorted_test_label_indexs = test_labels.squeeze(0).argsort(dim=0, descending=True).squeeze(1)

# Display the labels of the top 15% of the test set.
test_labels[sorted_test_label_indexs[:int(test_labels.size(0) * 0.15)]]

#### Metrics for Ranking
related article: [Evaluation Metrics for Ranking problems: Introduction and Examples](https://queirozf.com/entries/evaluation-metrics-for-ranking-problems-introduction-and-examples)

* Accuracy@K (Hit Rate)
* NDCG

**Note:** here K is a percentage of the test set size: K = 1%, 5%, 10%, 15%.

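**Accuracy@K (Hit Rate):**  

Accuracy@K  
= precision@K  
= true positives@K / (true positives@K + false positives@K)  
= recall@K  
= true positives@K / (true positives@K + false negatives@K)  

Precision and recall coincide here because the predicted top-K set and the true top-K set both contain exactly K items, so false positives@K = false negatives@K.  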

In [None]:
def accuracy_at_k(predicted, labels, k = 10):
    """Hit rate between the top-k% predictions and the top-k% labels.

    Both inputs are flattened first, so a column vector of shape (N, 1) is
    handled like a vector of shape (N,). Without that, sorting a (N, 1)
    tensor along its last axis returns only zeros and the hit count is wrong.

    Args:
        predicted: tensor of N predicted scores.
        labels: tensor of N ground-truth values.
        k: size of the top group as a percentage of N; at least one element
            is always selected.

    Returns:
        (accuracy, hit_count, k_number), where k_number is the number of
        selected elements, hit_count is how many indexes appear in both top
        groups and accuracy is hit_count / k_number.

    Raises:
        AssertionError: if the two inputs differ in their first dimension.
    """
    # check whether both sizes are identical
    assert predicted.size(0) == labels.size(0)

    flat_predicted = predicted.reshape(-1)
    flat_labels = labels.reshape(-1)

    k_number = max(int(flat_predicted.size(0) * k / 100), 1)

    # indexes of the k_number largest values on each side
    top_predicted = torch.argsort(flat_predicted, descending = True)[:k_number].tolist()
    top_labels = torch.argsort(flat_labels, descending = True)[:k_number].tolist()

    # the indexes within each top group are unique, so the size of the set
    # intersection equals the number of hits (and avoids a quadratic loop)
    hit_count = len(set(top_predicted) & set(top_labels))

    accuracy = hit_count/k_number

    return (accuracy, hit_count, k_number)

### Candidates for Comparison

1. our model (fitted)
2. historical average
3. random - our model without fitting

#### HA (Historical Average)

In [None]:
# Display the mean training label; it is the constant prediction of the
# historical-average baseline evaluated in the next cell.
train_labels.mean().item()

In [None]:
# Historical-average baseline: predict the mean training label for every
# test sample.
ha_predicted = torch.ones( (test_labels.shape[0],) ) * train_labels.mean().item()
ha_predicted = ha_predicted.to(device)

testset_size = test_labels.size(0)

# Flat (N,) view of the labels so they line up element-wise with the (N,)
# predictions. Passing the (N, 1) column instead makes the loss functions
# broadcast to an N x N matrix and makes the top-k label ranking degenerate.
_ha_flat_labels = test_labels.reshape(-1)

model_mae_scores = F.l1_loss(ha_predicted, _ha_flat_labels)
model_mse_scores = F.mse_loss(ha_predicted, _ha_flat_labels)

hit_rate_top1p = accuracy_at_k(ha_predicted, _ha_flat_labels, 1)
hit_rate_top5p = accuracy_at_k(ha_predicted, _ha_flat_labels, 5)
hit_rate_top10p = accuracy_at_k(ha_predicted, _ha_flat_labels, 10)
hit_rate_top15p = accuracy_at_k(ha_predicted, _ha_flat_labels, 15)

# sklearn expects one row per query: shape (1, N) for both labels and scores
_ha_true_row = _ha_flat_labels.unsqueeze(0).cpu()
_ha_score_row = ha_predicted.unsqueeze(0).cpu()


def _ha_ndcg_at(percent):
    """NDCG over the top `percent`% of the test set (at least one item)."""
    top_k = max(int(testset_size * percent / 100), 1)
    return sklearn.metrics.ndcg_score(_ha_true_row, _ha_score_row, k=top_k)


ndcg_score_1p = _ha_ndcg_at(1)
ndcg_score_5p = _ha_ndcg_at(5)
ndcg_score_10p = _ha_ndcg_at(10)
ndcg_score_15p = _ha_ndcg_at(15)

print(f"MAE: {model_mae_scores.item()}")
print(f"MSE: {model_mse_scores.item()}")
print(f"Hit Rate@1%: {hit_rate_top1p}")
print(f"Hit Rate@5%: {hit_rate_top5p}")
print(f"Hit Rate@10%: {hit_rate_top10p}")
print(f"Hit Rate@15%: {hit_rate_top15p}")
print(f"NDCG@1%: {ndcg_score_1p}")
print(f"NDCG@5%: {ndcg_score_5p}")
print(f"NDCG@10%: {ndcg_score_10p}")
print(f"NDCG@15%: {ndcg_score_15p}")

#### HM (Historical Median)

In [None]:
# Display the median training label; it is the constant prediction of the
# historical-median baseline evaluated in the next cell.
train_labels.median().item()

In [None]:
# Historical-median baseline: predict the median training label for every
# test sample.
hm_predicted = torch.ones( (test_labels.shape[0],) ) * train_labels.median().item()
hm_predicted = hm_predicted.to(device)

testset_size = test_labels.size(0)

# Flat (N,) view of the labels so they line up element-wise with the (N,)
# predictions. Passing the (N, 1) column instead makes the loss functions
# broadcast to an N x N matrix and makes the top-k label ranking degenerate.
_hm_flat_labels = test_labels.reshape(-1)

model_mae_scores = F.l1_loss(hm_predicted, _hm_flat_labels)
model_mse_scores = F.mse_loss(hm_predicted, _hm_flat_labels)

hit_rate_top1p = accuracy_at_k(hm_predicted, _hm_flat_labels, 1)
hit_rate_top5p = accuracy_at_k(hm_predicted, _hm_flat_labels, 5)
hit_rate_top10p = accuracy_at_k(hm_predicted, _hm_flat_labels, 10)
hit_rate_top15p = accuracy_at_k(hm_predicted, _hm_flat_labels, 15)

# sklearn expects one row per query: shape (1, N) for both labels and scores
_hm_true_row = _hm_flat_labels.unsqueeze(0).cpu()
_hm_score_row = hm_predicted.unsqueeze(0).cpu()


def _hm_ndcg_at(percent):
    """NDCG over the top `percent`% of the test set (at least one item)."""
    top_k = max(int(testset_size * percent / 100), 1)
    return sklearn.metrics.ndcg_score(_hm_true_row, _hm_score_row, k=top_k)


ndcg_score_1p = _hm_ndcg_at(1)
ndcg_score_5p = _hm_ndcg_at(5)
ndcg_score_10p = _hm_ndcg_at(10)
ndcg_score_15p = _hm_ndcg_at(15)

print(f"MAE: {model_mae_scores.item()}")
print(f"MSE: {model_mse_scores.item()}")
print(f"Hit Rate@1%: {hit_rate_top1p}")
print(f"Hit Rate@5%: {hit_rate_top5p}")
print(f"Hit Rate@10%: {hit_rate_top10p}")
print(f"Hit Rate@15%: {hit_rate_top15p}")
print(f"NDCG@1%: {ndcg_score_1p}")
print(f"NDCG@5%: {ndcg_score_5p}")
print(f"NDCG@10%: {ndcg_score_10p}")
print(f"NDCG@15%: {ndcg_score_15p}")

## 7. Automatic Training

In [None]:
# Default hyper-parameters of `train` below.
LR_REGRESSION = 0.2e-3 # learning rate of the Adam optimizer for PopularityModel
LR_BERT_TWEET = 1e-5 # learning rate for BERT (tweet)
LR_BERT_NEWS_TITLE = 1e-6 # learning rate for BERT (title)
LR_BERT_NEWS_TEXT = 1e-6 # learning rate for BERT (text)
EPOCH = 500 # maximum number of training epochs
BATCH_SIZE = 720 # samples per mini-batch (training, validation and test)
EARLY_STOP_PATIENCE = 2 # consecutive non-improving validation epochs before stopping

In [None]:
def _load_partially_frozen_distilbert():
    """Load pretrained DistilBERT on `device` with only its last transformer layer trainable."""
    model = ppb.DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
    for param in model.parameters():
        param.requires_grad = False
    for param in model.transformer.layer[-1].parameters():
        param.requires_grad = True
    return model


def _first_token_embedding(bert_model, token_ids, attention_mask):
    """Return the hidden state at sequence position 0 (the [CLS] slot) of the model's first output."""
    return bert_model(token_ids, attention_mask=attention_mask)[0][:, 0, :]


def _predict_rows(popularity_model, bert_models, split, start, stop):
    """Run the three encoders and the regression head on rows [start, stop) of one split."""
    (tweet_ids, tweet_mask, title_ids, title_mask,
     text_ids, text_mask, numeric, news_numeric) = split
    bert_tweet, bert_title, bert_text = bert_models

    tweet_embedded = _first_token_embedding(bert_tweet, tweet_ids[start:stop], tweet_mask[start:stop])
    title_embedded = _first_token_embedding(bert_title, title_ids[start:stop], title_mask[start:stop])
    text_embedded = _first_token_embedding(bert_text, text_ids[start:stop], text_mask[start:stop])

    # cast the news features to the dtype of the tweet features so the
    # concatenation does not depend on implicit type promotion
    other_features = torch.cat(
        (numeric[start:stop], news_numeric[start:stop].to(numeric.dtype)), 1
    )
    return popularity_model(tweet_embedded, title_embedded, text_embedded, other_features)


def _mean_or_nan(values):
    """Arithmetic mean of a list of floats, or NaN for an empty list."""
    return sum(values) / len(values) if values else float('nan')


def train(regression_lr=LR_REGRESSION, bert_tweet_lr=LR_BERT_TWEET, bert_news_title_lr=LR_BERT_NEWS_TITLE, bert_news_text_lr=LR_BERT_NEWS_TEXT,
          max_epoch=EPOCH, batch_size=BATCH_SIZE, early_stop_patience=EARLY_STOP_PATIENCE, verbose=True, manual_seed=None):
    """Train the popularity model once and evaluate it on the test split.

    Three DistilBERT encoders (tweet, news title, news text; only the last
    transformer layer of each is trainable) feed a `PopularityModel`. The
    model is trained with MSE loss on the module-level training tensors,
    validated on the cross-validation tensors after every epoch and finally
    scored on the test tensors.

    Args:
        regression_lr: learning rate of the Adam optimizer for PopularityModel.
        bert_tweet_lr, bert_news_title_lr, bert_news_text_lr: learning rates
            of the AdamW optimizers of the three encoders.
        max_epoch: maximum number of epochs.
        batch_size: number of rows per mini-batch.
        early_stop_patience: training stops once the validation loss failed
            to decrease for this many consecutive epochs.
        verbose: print progress and metrics and plot the loss curves.
        manual_seed: RNG seed; a fresh seed is drawn when None.

    Returns:
        dict with the seed and the test metrics: 'mae', 'mse', 'hr1p',
        'hr5p', 'hr10p', 'hr15p', 'ndcg1p', 'ndcg5p', 'ndcg10p', 'ndcg15p'.
    """
    # `is not None` so that an explicit seed of 0 is honoured
    seed = manual_seed if manual_seed is not None else torch.random.seed()
    torch.manual_seed(seed)

    popularity_model = PopularityModel(tweet_in_dim=768, tweet_to_dim=128,
                 news_title_in_dim=768, news_title_to_dim=64,
                 news_text_in_dim=768, news_text_to_dim=128,
                 other_dimension = 42).to(device)
    popularity_model.train()

    # one encoder each for tweet, news title and news text (created in this order)
    bert_model_tweet = _load_partially_frozen_distilbert()
    bert_model_news_title = _load_partially_frozen_distilbert()
    bert_model_news_text = _load_partially_frozen_distilbert()
    bert_models = (bert_model_tweet, bert_model_news_title, bert_model_news_text)

    optimizers = (
        torch.optim.Adam(popularity_model.parameters(), lr = regression_lr),
        AdamW(bert_model_tweet.parameters(), lr=bert_tweet_lr),
        AdamW(bert_model_news_title.parameters(), lr=bert_news_title_lr),
        AdamW(bert_model_news_text.parameters(), lr=bert_news_text_lr),
    )

    train_split = (train_text_tensors, train_text_mask,
                   train_news_title_tensors, train_news_title_mask,
                   train_news_text_tensors, train_news_text_mask,
                   train_numeric_tensors, train_news_numeric_tensors)
    cv_split = (cv_text_tensors, cv_text_mask,
                cv_news_title_tensors, cv_news_title_mask,
                cv_news_text_tensors, cv_news_text_mask,
                cv_numeric_tensors, cv_news_numeric_tensors)
    test_split = (test_text_tensors, test_text_mask,
                  test_news_title_tensors, test_news_title_mask,
                  test_news_text_tensors, test_news_text_mask,
                  test_numeric_tensors, test_news_numeric_tensors)

    train_size = train_labels.size(0)
    cv_size = cv_labels.size(0)
    testset_size = test_labels.size(0)

    epoch_losses = []

    # cross validation for early stopping
    current_val_error = float('inf')
    val_error_inc_count = 0
    cv_losses = []

    for epoch in range(max_epoch):

        if verbose:
            print(epoch, end=",")

        # batch training (batches are taken in storage order, without shuffling)
        batch_losses = []
        for start in range(0, train_size, batch_size):
            stop = min(start + batch_size, train_size)

            for optimizer in optimizers:
                optimizer.zero_grad()

            predicted = _predict_rows(popularity_model, bert_models, train_split, start, stop)

            # compute loss (plain mean squared error)
            loss = F.mse_loss(predicted, train_labels[start:stop], reduction='mean')

            # backward propagation
            loss.backward()
            for optimizer in optimizers:
                optimizer.step()

            # store a plain float: keeping the tensor would keep every
            # batch's autograd graph alive until the end of the epoch
            batch_losses.append(loss.item())

        epoch_losses.append(_mean_or_nan(batch_losses))

        # validation pass
        with torch.no_grad():
            cv_batch_losses = []
            for start in range(0, cv_size, batch_size):
                stop = min(start + batch_size, cv_size)
                predicted = _predict_rows(popularity_model, bert_models, cv_split, start, stop)
                cv_batch_losses.append(
                    F.mse_loss(predicted, cv_labels[start:stop], reduction='mean').item()
                )

        cv_error = _mean_or_nan(cv_batch_losses)
        cv_losses.append(cv_error)

        # early stopping compares against the previous epoch's validation loss
        if cv_error >= current_val_error:
            val_error_inc_count += 1
            current_val_error = cv_error
            if val_error_inc_count >= early_stop_patience:
                if verbose:
                    print(f"early stopping triggered! stopped at epoch {epoch}")
                break
        else:
            val_error_inc_count = 0
            current_val_error = cv_error

    # evaluation
    with torch.no_grad():

        model_test_predicted = torch.zeros((testset_size,), dtype=torch.float).to(device)
        for start in range(0, testset_size, batch_size):
            stop = min(start + batch_size, testset_size)
            model_test_predicted[start:stop] = _predict_rows(
                popularity_model, bert_models, test_split, start, stop
            ).squeeze(1)

        # Flat (N,) labels: comparing (N,) predictions with the (N, 1) label
        # column would broadcast to an N x N matrix and report the error over
        # all prediction/label pairs instead of the matching ones.
        flat_test_labels = test_labels.reshape(-1)

        model_mae_scores = F.l1_loss(model_test_predicted, flat_test_labels)
        model_mse_scores = F.mse_loss(model_test_predicted, flat_test_labels)

        hit_rate_top1p = accuracy_at_k(model_test_predicted, flat_test_labels, 1)
        hit_rate_top5p = accuracy_at_k(model_test_predicted, flat_test_labels, 5)
        hit_rate_top10p = accuracy_at_k(model_test_predicted, flat_test_labels, 10)
        hit_rate_top15p = accuracy_at_k(model_test_predicted, flat_test_labels, 15)

        # sklearn expects one row per query: shape (1, N) for labels and scores
        true_row = flat_test_labels.unsqueeze(0).cpu()
        score_row = model_test_predicted.unsqueeze(0).cpu()
        ndcg_scores = {
            percent: sklearn.metrics.ndcg_score(
                true_row, score_row,
                # at least one item, matching accuracy_at_k on tiny test sets
                k=max(int(testset_size * percent / 100), 1),
            )
            for percent in (1, 5, 10, 15)
        }
        ndcg_score_1p = ndcg_scores[1]
        ndcg_score_5p = ndcg_scores[5]
        ndcg_score_10p = ndcg_scores[10]
        ndcg_score_15p = ndcg_scores[15]

    if verbose:
        print(f"seed: {seed}")
        print(f"MAE: {model_mae_scores.item()}")
        print(f"MSE: {model_mse_scores.item()}")
        print(f"Hit Rate@1%: {hit_rate_top1p}")
        print(f"Hit Rate@5%: {hit_rate_top5p}")
        print(f"Hit Rate@10%: {hit_rate_top10p}")
        print(f"Hit Rate@15%: {hit_rate_top15p}")
        print(f"NDCG@1%: {ndcg_score_1p}")
        print(f"NDCG@5%: {ndcg_score_5p}")
        print(f"NDCG@10%: {ndcg_score_10p}")
        print(f"NDCG@15%: {ndcg_score_15p}")

        plt.plot(epoch_losses, label = 'training')
        plt.plot(cv_losses, label = 'validation')
        plt.xlabel('epoch')
        plt.ylabel('MSE')
        plt.legend()
        plt.show()

    result = {
        'seed': seed,
        'mae': model_mae_scores.item(),
        'mse': model_mse_scores.item(),
        'hr1p': hit_rate_top1p[0],
        'hr5p': hit_rate_top5p[0],
        'hr10p': hit_rate_top10p[0],
        'hr15p': hit_rate_top15p[0],
        'ndcg1p': ndcg_score_1p,
        'ndcg5p': ndcg_score_5p,
        'ndcg10p': ndcg_score_10p,
        'ndcg15p': ndcg_score_15p
    }

    # drop the references to the models and optimizers that were actually
    # created here, then hand cached CUDA memory back before the next run
    del popularity_model, bert_models, optimizers
    del bert_model_tweet, bert_model_news_title, bert_model_news_text
    torch.cuda.empty_cache()

    return result

In [None]:
# Train ten independently seeded models and give each one an aggregate
# score: the sum of its four hit rates and its four NDCG values.
SCORE_KEYS = ('hr1p', 'hr5p', 'hr10p', 'hr15p', 'ndcg1p', 'ndcg5p', 'ndcg10p', 'ndcg15p')

results = []
for i in range(10):
    print(f"Model {i+1}")
    res = train(
        regression_lr=LR_REGRESSION,
        bert_tweet_lr=LR_BERT_TWEET,
        bert_news_title_lr=LR_BERT_NEWS_TITLE,
        bert_news_text_lr=LR_BERT_NEWS_TEXT,
        max_epoch=EPOCH,
        batch_size=BATCH_SIZE,
        early_stop_patience=EARLY_STOP_PATIENCE,
        verbose=True,
        manual_seed=None,
    )
    res['score'] = sum(res[key] for key in SCORE_KEYS)
    results.append(res)

In [None]:
# Rank the runs by aggregate score, best first, and display the top three.
sorted_results = sorted(results, key=lambda run: run['score'], reverse=True)
sorted_results[:3]

In [None]:
# Scratch cell: turn a comma-separated list of numbers into a "+" expression.
s = "0.0018, 0.0018, 0.0043, 0.0038, 0.0044, 0.0026, 0.0018, 0.0031"
"+".join(s.split(","))