## 1. Import Packages

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers as ppb
from transformers import AdamW

import matplotlib.pyplot as plt

import warnings
import os.path as path

import sklearn
import scipy

# suppress ALL Python warnings to keep the notebook output clean
# NOTE(review): this also hides library warnings (deprecations, shape-mismatch notices, ...)
warnings.filterwarnings('ignore')

# device for training: use CUDA when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"device type: {device.type}")

## 2. Import Dataset

In [None]:
# CSV files holding the train / cross-validation / test splits,
# given as paths relative to the current working directory
DATASET_TRAIN = "./data/splitted/user_profile_processed_train.csv"
DATASET_CV = "./data/splitted/user_profile_processed_cv.csv"
DATASET_TEST = "./data/splitted/user_profile_processed_test.csv"

In [None]:
# read the three splits; the first row of every file is the header
train_df, cv_df, test_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (DATASET_TRAIN, DATASET_CV, DATASET_TEST)
)

# quick sanity report on what was loaded
print(f"Columns: {train_df.columns}")
print(f"train set size: {len(train_df)}")
print(f"cross validation set size: {len(cv_df)}")
print(f"test set size: {len(test_df)}")

## 3. Preprocess Data

### Text Features

In [None]:
 # load pretrained tokenizer

# For DistilBERT:
tokenizer = ppb.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def encode_description(text):
    """Turn one description string into a list of token ids (special tokens included)."""
    return tokenizer.encode(text, add_special_tokens=True)


# tokenize the `description` column of every split
train_des_tokenized = train_df['description'].apply(encode_description)
cv_des_tokenized = cv_df['description'].apply(encode_description)
test_des_tokenized = test_df['description'].apply(encode_description)

print(f"train tokenized shape: {train_des_tokenized.shape}")

In [None]:
# average number of token ids per description, taken over all three splits
tokenized_splits = (train_des_tokenized, cv_des_tokenized, test_des_tokenized)

total_length = sum(
    len(token_ids)
    for split in tokenized_splits
    for token_ids in split.values
)
sample_count = sum(split.shape[0] for split in tokenized_splits)

# truncated towards zero so it can be used directly as a sequence length
average_length = int(total_length / sample_count)
print(average_length)

In [None]:
# count how many descriptions (over all three splits) are exactly a single space ' '
empty_count = sum(
    1
    for frame in (train_df, cv_df, test_df)
    for text in frame['description'].values
    if text == ' '
)

print(empty_count)

In [None]:
# truncate every tokenized description to at most `average_length` token ids.
# `Series.apply` works element-wise, so it does not depend on the Series having
# a default 0..n-1 index the way per-label assignment in a range(len()) loop does.
# NOTE(review): slicing keeps only the leading ids, so any trailing special token
# of a longer sequence is cut off as well.
train_des_tokenized = train_des_tokenized.apply(lambda token_ids: token_ids[:average_length])
cv_des_tokenized = cv_des_tokenized.apply(lambda token_ids: token_ids[:average_length])
test_des_tokenized = test_des_tokenized.apply(lambda token_ids: token_ids[:average_length])

In [None]:
# padding
def pad_token_ids(token_ids, length):
    """Right-pad a list of token ids with zeros until it holds `length` entries."""
    missing = length - len(token_ids)
    return token_ids + [0] * missing


# every row ends up with exactly `average_length` ids, giving a rectangular array
train_des_padded = np.array([pad_token_ids(ids, average_length) for ids in train_des_tokenized.values])
cv_des_padded = np.array([pad_token_ids(ids, average_length) for ids in cv_des_tokenized.values])
test_des_padded = np.array([pad_token_ids(ids, average_length) for ids in test_des_tokenized.values])

print(f"train_padded: {train_des_padded.shape}")

In [None]:
# masking: 1 where the padded array holds a non-zero id, 0 where it holds the padding value 0
train_attention_mask = (train_des_padded != 0).astype(int)
cv_attention_mask = (cv_des_padded != 0).astype(int)
test_attention_mask = (test_des_padded != 0).astype(int)

In [None]:
# move token ids and attention masks to the selected device (GPU when available)
def to_device_long(array):
    """Copy a NumPy array into an int64 tensor living on `device`."""
    return torch.tensor(array, dtype=torch.long).to(device)


train_text_tensors = to_device_long(train_des_padded)
cv_text_tensors = to_device_long(cv_des_padded)
test_text_tensors = to_device_long(test_des_padded)

train_text_mask = to_device_long(train_attention_mask)
cv_text_mask = to_device_long(cv_attention_mask)
test_text_mask = to_device_long(test_attention_mask)

### Numeric Features

In [None]:
# count-valued profile columns fed to the model as numeric features
numeric_column_names = ['followers_count', 'friends_count', 'listed_count', 'favorites_count', 'statuses_count']
# True/False profile columns; mapped to 1/0 before being used as features
boolean_column_names = ['protected', 'geo_enabled', 'verified']

Convert Boolean to 0 and 1:

In [None]:
# lookup table used to turn boolean flags into integers
d = {True: 1, False: 0}

# apply the mapping split by split, overwriting each boolean column in place
for frame in (train_df, cv_df, test_df):
    for c in boolean_column_names:
        frame[c] = frame[c].map(d)

In [None]:
# feature order: boolean flags first, then the count columns
feature_columns = boolean_column_names + numeric_column_names


def numeric_features_to_tensor(frame):
    """Pack the selected feature columns of `frame` into a float32 tensor on `device`."""
    return torch.tensor(frame[feature_columns].values, dtype=torch.float).to(device)


train_numeric_tensors = numeric_features_to_tensor(train_df)
cv_numeric_tensors = numeric_features_to_tensor(cv_df)
test_numeric_tensors = numeric_features_to_tensor(test_df)

for numeric_tensor in (train_numeric_tensors, cv_numeric_tensors, test_numeric_tensors):
    print(numeric_tensor.shape)

### Labels

In [None]:
def labels_to_tensor(frame):
    """Return the `cascade_size` column as a float32 column vector of shape (n, 1) on `device`."""
    values = torch.tensor(frame['cascade_size'].values, dtype=torch.float)
    return values.unsqueeze(1).to(device)


# regression targets for each split
train_labels = labels_to_tensor(train_df)
cv_labels = labels_to_tensor(cv_df)
test_labels = labels_to_tensor(test_df)

print(train_labels.shape)

## 4. Model

In [None]:
# downstream model
class PopularityModel(nn.Module):
    """Regression head combining a text embedding with numeric features.

    The text embedding is first projected by a linear layer, concatenated with
    the numeric features, and then passed through a three-layer MLP whose
    widths are the concatenated size, half of it, a quarter of it, and 1.
    A ReLU follows every linear layer, including the last, so the output is
    never negative.

    Args:
        text_in_dimension: size of the incoming text embedding.
        text_to_dimension: size the text embedding is projected to.
        numeric_in_dimension: number of numeric features per sample.
    """

    def __init__(self, text_in_dimension = 768, text_to_dimension = 32, numeric_in_dimension = 8):
        super().__init__()

        # projection applied to the text embedding before concatenation
        self.text_linear = nn.Linear(text_in_dimension, text_to_dimension)

        # width of the concatenated (text + numeric) vector entering the MLP
        self.mlp_input_dim = text_to_dimension + numeric_in_dimension
        half_width = int(self.mlp_input_dim / 2)
        quarter_width = int(self.mlp_input_dim / 4)

        self.linear1 = nn.Linear(self.mlp_input_dim, half_width)
        self.linear2 = nn.Linear(half_width, quarter_width)
        self.linear3 = nn.Linear(quarter_width, 1)

    def forward(self, embedded, numeric):
        """Return one prediction per row, shape (batch, 1).

        Args:
            embedded: text embeddings, shape (batch, text_in_dimension).
            numeric: numeric features, shape (batch, numeric_in_dimension).
        """
        projected_text = F.relu(self.text_linear(embedded))

        # join text and numeric features along the feature axis
        hidden = torch.cat((projected_text, numeric), 1)

        for layer in (self.linear1, self.linear2, self.linear3):
            hidden = F.relu(layer(hidden))

        return hidden

## 5. Evaluation

### Metrics for Ranking

* Accuracy@K (Hit Rate)
* NDCG@K

**Note:** K is expressed as a percentage of the evaluated set; we report K = 1%, 5%, 10% and 15%.

In [None]:
def accuracy_at_k(predicted, labels, k = 10):
    """Hit rate between the top-k% predicted items and the top-k% true items.

    Both inputs are ranked in descending order; the function counts how many
    positions appear in the top-k% of both rankings.

    Args:
        predicted: 1-D tensor of predicted scores, one per sample.
        labels: 1-D tensor of true scores, same length as `predicted`.
        k: cut-off as a percentage of the number of samples. At least one
            sample is always selected, even when the percentage rounds to 0.

    Returns:
        Tuple `(accuracy, hit_count, k_number)`: the fraction of hits, the
        number of hits, and the number of samples in the top-k% set.

    Raises:
        AssertionError: if the two tensors differ in length.
    """
    # check whether both sizes are identical
    assert predicted.size(0) == labels.size(0), "predicted and labels must have the same length"

    # number of samples that make up the top-k% (never less than one)
    k_number = max(int(predicted.size(0) * k / 100), 1)

    # indices of the k_number largest values of each tensor
    topk_predicted_index = torch.argsort(predicted, descending = True)[:k_number]
    topk_label_index = torch.argsort(labels, descending = True)[:k_number]

    # indices are unique within each ranking, so the size of the set
    # intersection equals the hit count; this replaces a quadratic loop of
    # tensor membership tests with a single transfer per tensor
    hit_count = len(set(topk_predicted_index.tolist()) & set(topk_label_index.tolist()))

    accuracy = hit_count/k_number

    return (accuracy, hit_count, k_number)

## 6. Automatic training

In [None]:
# default hyper-parameters picked up by train()
LR_BERT = 1e-5  # learning rate of the optimizer that updates the BERT weights
LR_REGRESSION = 1e-3  # learning rate of the optimizer that updates the regression head
EPOCH = 10000  # upper bound on the number of epochs (early stopping may end training sooner)
BATCH_SIZE = 4800  # number of samples per mini-batch
EARLY_STOP_PATIENCE = 3  # consecutive epochs of non-decreasing validation loss before stopping

In [None]:
def _predict_batch(bert_model, popularity_model, text_ids, text_mask, numeric):
    """Embed one batch with BERT and run the regression head on it.

    The hidden state at sequence position 0 (the [CLS] token) is used as the
    embedding of the whole description.
    """
    hidden_states = bert_model(text_ids, attention_mask=text_mask)
    cls_embedding = hidden_states[0][:, 0, :]
    return popularity_model(cls_embedding, numeric)


def _mean_batch_mse(bert_model, popularity_model, text_ids, text_mask, numeric, labels, batch_size):
    """Return the unweighted mean of the per-batch MSE losses over a data split."""
    sample_count = labels.size(0)
    batch_losses = []
    for start in range(0, sample_count, batch_size):
        stop = min(start + batch_size, sample_count)
        predicted = _predict_batch(
            bert_model, popularity_model,
            text_ids[start:stop], text_mask[start:stop], numeric[start:stop],
        )
        batch_losses.append(F.mse_loss(predicted, labels[start:stop], reduction='mean').item())
    return torch.tensor(batch_losses).mean().item()


def _top_percent_count(sample_count, percent):
    """Return the number of samples in the top `percent`%, never less than one."""
    return max(int(sample_count * percent / 100), 1)


def train(lr_regression=LR_REGRESSION, lr_bert=LR_BERT, max_epoch=EPOCH, batch_size=BATCH_SIZE, early_stop_patience=EARLY_STOP_PATIENCE, verbose=True, manual_seed=None):
    """Train DistilBERT + PopularityModel and evaluate on the test split.

    Reads the module-level tensors `train_*`, `cv_*` and `test_*` (text ids,
    attention masks, numeric features, labels) and the module-level `device`.
    Only the last transformer layer of DistilBERT is fine-tuned; the
    regression head is trained from scratch. The loss is plain mean squared
    error.

    Early stopping: after every epoch the validation loss is compared with the
    previous epoch's. Training stops once it has failed to decrease for
    `early_stop_patience` consecutive epochs.

    Args:
        lr_regression: learning rate for the regression head (Adam).
        lr_bert: learning rate for DistilBERT (AdamW).
        max_epoch: maximum number of training epochs.
        batch_size: number of samples per mini-batch (also used for evaluation).
        early_stop_patience: number of consecutive non-improving epochs tolerated.
        verbose: print progress and metrics and plot the loss curves.
        manual_seed: seed for the torch RNG; a random seed is drawn when None.

    Returns:
        dict with keys 'seed', 'mae', 'mse', 'hr1p', 'hr5p', 'hr10p', 'hr15p',
        'ndcg1p', 'ndcg5p', 'ndcg10p', 'ndcg15p', 'train_losses', 'cv_losses'.
    """
    # `is not None` (rather than truthiness) so an explicit seed of 0 is honoured
    seed = manual_seed if manual_seed is not None else torch.random.seed()
    torch.manual_seed(seed)

    # downstream model
    popularity_model = PopularityModel(text_in_dimension = 768, text_to_dimension = 768, numeric_in_dimension = 8).to(device)
    popularity_model.train()

    # BERT
    bert_model = ppb.DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)

    # fine-tune only the last transformer layer of BERT
    for param in bert_model.parameters():
        param.requires_grad = False
    for param in bert_model.transformer.layer[-1].parameters():
        param.requires_grad = True

    # optimizer instances
    optimizer_bert = AdamW(bert_model.parameters(), lr = lr_bert)
    optimizer_regression = torch.optim.Adam(popularity_model.parameters(), lr = lr_regression)

    train_losses = []
    cv_losses = []

    # state for early stopping on the validation loss
    previous_cv_error = float('inf')
    rising_epochs = 0

    train_size = train_labels.size(0)
    predicted = None
    loss = None

    for epoch in range(max_epoch):

        if verbose:
            print(f"{epoch}", end=".")

        batch_losses = []

        # batch training
        for start in range(0, train_size, batch_size):

            if verbose and start % (4 * batch_size) == 0:
                print(".", end="")

            optimizer_regression.zero_grad()
            optimizer_bert.zero_grad()

            stop = min(start + batch_size, train_size)

            predicted = _predict_batch(
                bert_model, popularity_model,
                train_text_tensors[start:stop],
                train_text_mask[start:stop],
                train_numeric_tensors[start:stop],
            )

            # compute loss (mean squared error)
            loss = F.mse_loss(predicted, train_labels[start:stop], reduction='mean')

            # backward propagation
            loss.backward()
            optimizer_regression.step()
            optimizer_bert.step()

            # keep only the Python float so the autograd graph of each batch can be freed
            batch_losses.append(loss.item())

        train_losses.append(torch.tensor(batch_losses).mean().item())

        # cross validation & early stopping
        with torch.no_grad():
            cv_error = _mean_batch_mse(
                bert_model, popularity_model,
                cv_text_tensors, cv_text_mask, cv_numeric_tensors, cv_labels,
                batch_size,
            )
        cv_losses.append(cv_error)

        if cv_error >= previous_cv_error:
            rising_epochs += 1
            previous_cv_error = cv_error
            if rising_epochs >= early_stop_patience:
                if verbose:
                    print(f"early stopping triggered! stopped at epoch {epoch}")
                break
        else:
            rising_epochs = 0
            previous_cv_error = cv_error

    with torch.no_grad():

        testset_size = test_labels.size(0)

        model_test_predicted = torch.zeros((testset_size,), dtype=torch.float).to(device)
        for start in range(0, testset_size, batch_size):
            stop = min(start + batch_size, testset_size)
            model_test_predicted[start:stop] = _predict_batch(
                bert_model, popularity_model,
                test_text_tensors[start:stop],
                test_text_mask[start:stop],
                test_numeric_tensors[start:stop],
            ).squeeze(1)

        # labels are stored as (n, 1); flatten them so they line up element-wise
        # with the (n,) predictions instead of broadcasting to an (n, n) matrix
        flat_test_labels = test_labels.squeeze(1)

        mae = F.l1_loss(model_test_predicted, flat_test_labels).item()
        mse = F.mse_loss(model_test_predicted, flat_test_labels).item()

        hit_rate_top1p = accuracy_at_k(model_test_predicted, flat_test_labels, 1)
        hit_rate_top5p = accuracy_at_k(model_test_predicted, flat_test_labels, 5)
        hit_rate_top10p = accuracy_at_k(model_test_predicted, flat_test_labels, 10)
        hit_rate_top15p = accuracy_at_k(model_test_predicted, flat_test_labels, 15)

        # ndcg_score expects (n_queries, n_items) arrays; the test set is one query
        y_true = flat_test_labels.unsqueeze(0).cpu().numpy()
        y_score = model_test_predicted.unsqueeze(0).cpu().numpy()

        # same "at least one item" rule for k as accuracy_at_k uses
        ndcg_score_1p = sklearn.metrics.ndcg_score(y_true, y_score, k=_top_percent_count(testset_size, 1))
        ndcg_score_5p = sklearn.metrics.ndcg_score(y_true, y_score, k=_top_percent_count(testset_size, 5))
        ndcg_score_10p = sklearn.metrics.ndcg_score(y_true, y_score, k=_top_percent_count(testset_size, 10))
        ndcg_score_15p = sklearn.metrics.ndcg_score(y_true, y_score, k=_top_percent_count(testset_size, 15))

    if verbose:
        print(f"seed: {seed}")
        print(f"MAE: {mae}")
        print(f"MSE: {mse}")
        print(f"Hit Rate@1%: {hit_rate_top1p}")
        print(f"Hit Rate@5%: {hit_rate_top5p}")
        print(f"Hit Rate@10%: {hit_rate_top10p}")
        print(f"Hit Rate@15%: {hit_rate_top15p}")
        print(f"NDCG@1%: {ndcg_score_1p}")
        print(f"NDCG@5%: {ndcg_score_5p}")
        print(f"NDCG@10%: {ndcg_score_10p}")
        print(f"NDCG@15%: {ndcg_score_15p}")

        # plot loss curve
        plt.plot(train_losses, label = 'training')
        plt.plot(cv_losses, label = 'validation')
        plt.xlabel('epoch')
        plt.ylabel('MSE')
        plt.legend()
        plt.show()

    # drop the references to models, optimizers and device tensors first, so
    # that empty_cache() can actually return their memory to the device
    popularity_model = None
    bert_model = None
    optimizer_regression = None
    optimizer_bert = None
    predicted = None
    loss = None
    model_test_predicted = None
    flat_test_labels = None
    torch.cuda.empty_cache()

    return {
        'seed': seed,
        'mae': mae,
        'mse': mse,
        'hr1p': hit_rate_top1p[0],
        'hr5p': hit_rate_top5p[0],
        'hr10p': hit_rate_top10p[0],
        'hr15p': hit_rate_top15p[0],
        'ndcg1p': ndcg_score_1p,
        'ndcg5p': ndcg_score_5p,
        'ndcg10p': ndcg_score_10p,
        'ndcg15p': ndcg_score_15p,
        'train_losses': train_losses,
        'cv_losses': cv_losses
    }

In [None]:
# metrics summed into a single ranking score for each trained model
SCORE_KEYS = ('hr1p', 'hr5p', 'hr10p', 'hr15p', 'ndcg1p', 'ndcg5p', 'ndcg10p', 'ndcg15p')

# train several models (each with its own random seed) and collect their results
results = []
for run_index in range(2):
    print(f"No.{run_index+1} model:")

    run_result = train(lr_regression=LR_REGRESSION, lr_bert=LR_BERT, max_epoch=EPOCH, batch_size = BATCH_SIZE, early_stop_patience=EARLY_STOP_PATIENCE, verbose=True)
    run_result['number'] = run_index
    run_result['score'] = sum(run_result[key] for key in SCORE_KEYS)
    results.append(run_result)

In [None]:
# rank the runs from highest to lowest combined score and show the best three
sorted_results = list(results)
sorted_results.sort(key=lambda run: run['score'], reverse=True)
sorted_results[:3]