# Cascade Network (Numeric Features + Graph Features)

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers as ppb
from transformers import AdamW

from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv

import matplotlib.pyplot as plt

import warnings
import os.path as path
import json
from datetime import datetime

import sklearn
import scipy

# silence library warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')

# device for training: the GPU when CUDA is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"device type: {device.type}")

## 2. Import Dataset

In [None]:
# Paths of the pre-split CSV files: one each for training, cross validation
# (used for early stopping) and the final test evaluation.
DATASET_TRAIN = "./data/splitted/cascade_network_train.csv"
DATASET_CV = "./data/splitted/cascade_network_cv.csv"
DATASET_TEST = "./data/splitted/cascade_network_test.csv"

In [None]:
# load the three splits; the first row of every file is the column header
train_df, cv_df, test_df = (
    pd.read_csv(csv_path, header=0)
    for csv_path in (DATASET_TRAIN, DATASET_CV, DATASET_TEST)
)

# quick sanity check of the schema and of the split sizes
print(f"Columns: {train_df.columns}")
print(f"train set size: {len(train_df)}")
print(f"cross validation set size: {len(cv_df)}")
print(f"test set size: {len(test_df)}")

## 3. Preprocess Data

#### Numeric Features

In [None]:
# Columns of the data frames that are fed to the model as plain numeric
# features (their count must match PopularityModel's `numeric_dimension`).
numeric_column_names = ['max_deg', 'avg_deg', 'min_deg', 'max_timediff', 'min_timediff', 'avg_timediff', 'node_number', 'edge_number']

In [None]:
# one float tensor of shape (rows, len(numeric_column_names)) per split,
# placed on the training device
train_numeric_tensors, cv_numeric_tensors, test_numeric_tensors = (
    torch.tensor(split_df[numeric_column_names].values, dtype=torch.float).to(device)
    for split_df in (train_df, cv_df, test_df)
)

print(
    train_numeric_tensors.shape,
    cv_numeric_tensors.shape,
    test_numeric_tensors.shape,
    sep="\n",
)

#### Graph Features

In [None]:
# Output lists: for every split, one list of graphs per snapshot (h1, h2, h3).
# Entry i of each list belongs to row i of the corresponding data frame.
train_h1_graphs, train_h2_graphs, train_h3_graphs = [], [], []
cv_h1_graphs, cv_h2_graphs, cv_h3_graphs = [], [], []
test_h1_graphs, test_h2_graphs, test_h3_graphs = [], [], []

# target lists per split, in the same order as the data frames iterated below
split_buckets = [
    (train_h1_graphs, train_h2_graphs, train_h3_graphs),
    (cv_h1_graphs, cv_h2_graphs, cv_h3_graphs),
    (test_h1_graphs, test_h2_graphs, test_h3_graphs),
]

# (nodes column, edges column) of each snapshot, in h1 -> h2 -> h3 order
snapshot_columns = [
    ('h1_nodes', 'h1_edges'),
    ('h2_nodes', 'h2_edges'),
    ('h3_nodes', 'h3_edges'),
]

for split_df, buckets in zip([train_df, cv_df, test_df], split_buckets):

    for _, row in split_df.iterrows():

        # the cells are single-quoted literals; swap the quotes so that the
        # json parser accepts them
        snapshots = [
            (
                json.loads(row[nodes_column].replace("'", '"')),
                json.loads(row[edges_column].replace("'", '"')),
            )
            for nodes_column, edges_column in snapshot_columns
        ]

        graph_series = []
        for nodes, edges in snapshots:

            # `nodes` maps an original node id to that node's feature list;
            # re-index the ids to 0..n-1 in dictionary order so that they can
            # be used as row positions of the feature matrix
            node_features = [nodes[node_id] for node_id in nodes]
            node_reindex_map = {
                int(node_id): position for position, node_id in enumerate(nodes)
            }

            # edge: [user A, user A's follower]
            nodes_from = [node_reindex_map[edge[0]] for edge in edges]
            nodes_to = [node_reindex_map[edge[1]] for edge in edges]

            feature_matrix = torch.tensor(node_features, dtype=torch.float)
            connectivity = torch.tensor([nodes_from, nodes_to], dtype=torch.long)

            graph_series.append(
                Data(x=feature_matrix, edge_index=connectivity).to(device)
            )

        # store the three snapshots of this row only after all of them were built
        for bucket, graph in zip(buckets, graph_series):
            bucket.append(graph)

In [None]:
BATCH_SIZE = 10000

# shuffle stays off everywhere: the training code pairs batch b of every
# loader with rows [b * BATCH_SIZE, (b + 1) * BATCH_SIZE) of the numeric
# feature and label tensors, so the graph order must match the row order.
train_h1_dataset, train_h2_dataset, train_h3_dataset = (
    DataLoader(graphs, batch_size=BATCH_SIZE, shuffle=False)
    for graphs in (train_h1_graphs, train_h2_graphs, train_h3_graphs)
)

cv_h1_dataset, cv_h2_dataset, cv_h3_dataset = (
    DataLoader(graphs, batch_size=BATCH_SIZE, shuffle=False)
    for graphs in (cv_h1_graphs, cv_h2_graphs, cv_h3_graphs)
)

test_h1_dataset, test_h2_dataset, test_h3_dataset = (
    DataLoader(graphs, batch_size=BATCH_SIZE, shuffle=False)
    for graphs in (test_h1_graphs, test_h2_graphs, test_h3_graphs)
)

#### Label

In [None]:
# regression target: the cascade size of every row as a float column vector
# of shape (rows, 1), placed on the training device
train_labels, cv_labels, test_labels = (
    torch.tensor(split_df['cascade_size'].values, dtype=torch.float).unsqueeze(1).to(device)
    for split_df in (train_df, cv_df, test_df)
)

print(train_labels.shape)

## 4. Model

In [None]:
class PopularityModel(torch.nn.Module):
    """Cascade size regressor: GCN per snapshot -> GRU over snapshots -> MLP.

    Each of the three graph snapshots is passed through its own GCN layer and
    sum-pooled into one vector per graph.  The three pooled vectors are fed,
    in order, to a shared GRU cell; its final hidden state is concatenated
    with the numeric features and mapped to a single value by a 3-layer MLP.

    Args:
        node_dimension: size of a node feature vector (GCN input and output).
        numeric_dimension: number of numeric features per sample.
    """

    def __init__(self, node_dimension=8, numeric_dimension=8):

        super().__init__()

        self.node_dim = node_dimension
        self.numeric_dim = numeric_dimension

        # the GRU hidden state is half as wide as a node embedding
        hidden_dim = int(self.node_dim/2)

        # width of the MLP input: GRU state followed by the numeric features
        self.mlp_dim = hidden_dim + self.numeric_dim

        # one graph convolution per snapshot (weights are not shared)
        self.g1_gconv = GCNConv(self.node_dim, self.node_dim)
        self.g2_gconv = GCNConv(self.node_dim, self.node_dim)
        self.g3_gconv = GCNConv(self.node_dim, self.node_dim)

        # a single recurrent cell consumes the snapshots one after another
        self.rnn = nn.GRUCell(self.node_dim, hidden_dim)

        # regression head, halving the width at every layer
        self.linear1 = nn.Linear(self.mlp_dim, int(self.mlp_dim/2))
        self.linear2 = nn.Linear(int(self.mlp_dim/2), int(self.mlp_dim/4))
        self.linear3 = nn.Linear(int(self.mlp_dim/4), 1)

    def _sum_pool(self, node_embeddings, graph_assignment, num_graphs):
        """Sum the node embeddings of every graph of a batch.

        `graph_assignment[n]` is the index of the graph node `n` belongs to.
        Returns a (num_graphs, node_dim) tensor; graphs without any node keep
        an all-zero row.
        """
        pooled = torch.zeros((num_graphs, self.node_dim)).to(device)
        for graph_id in range(num_graphs):
            members = graph_assignment == graph_id
            pooled[graph_id] = node_embeddings[members].sum(0)
        return pooled

    def forward(self, graph1, graph2, graph3, numeric_features):
        """Predict one value per sample.

        Args:
            graph1, graph2, graph3: batched graphs of the three snapshots;
                each must expose `x`, `edge_index` and `batch`, and `graph1`
                additionally `num_graphs`.
            numeric_features: (batch_size, numeric_dim) tensor, row-aligned
                with the graphs of the batches.

        Returns:
            (batch_size, 1) tensor of non-negative predictions.
        """
        # the number of samples is taken from the first snapshot only
        num_graphs = graph1.num_graphs

        # graph convolution + ReLU on all nodes, then sum-pool per graph:
        # (all node_number) * node_dimension -> batch_size * node_dimension
        pooled_snapshots = []
        snapshot_layers = (
            (self.g1_gconv, graph1),
            (self.g2_gconv, graph2),
            (self.g3_gconv, graph3),
        )
        for gconv, graph in snapshot_layers:
            node_embeddings = F.relu(gconv(graph.x, graph.edge_index))
            pooled_snapshots.append(
                self._sum_pool(node_embeddings, graph.batch, num_graphs)
            )

        # NOTE(review): the initial hidden state is drawn from a normal
        # distribution on every call, so two forward passes on the same input
        # differ unless the RNG is re-seeded - confirm this is intended.
        hx = torch.randn((num_graphs, int(self.node_dim/2)), dtype=torch.float).to(device)
        for pooled in pooled_snapshots:
            hx = self.rnn(pooled, hx)

        mlp_input = torch.cat((hx, numeric_features), 1)
        hidden = F.relu(self.linear1(mlp_input))
        hidden = F.relu(self.linear2(hidden))

        # the final ReLU clamps the prediction to be >= 0
        return F.relu(self.linear3(hidden))

## 5. Evaluation

In [None]:
def accuracy_at_k(predicted, labels, k = 10):
    """Return the overlap between the top-k% predictions and the top-k% labels.

    Both inputs are ranked in descending order; the result says how many of
    the samples ranked in the top `k` percent by `predicted` are also ranked
    in the top `k` percent by `labels`.

    Args:
        predicted: tensor with one predicted score per sample.
        labels: tensor with one ground-truth value per sample.
        k: percentage (0-100) of the samples that forms the "top" group; at
            least one sample is always selected.

    Returns:
        Tuple `(accuracy, hit_count, k_number)` where `k_number` is the size
        of the top group, `hit_count` the number of samples in both top
        groups and `accuracy` their ratio.

    Raises:
        AssertionError: if the inputs hold a different number of samples.
    """
    # rank over the samples, not over a trailing size-1 axis: column vectors
    # of shape (N, 1) are treated like flat vectors of shape (N,)
    predicted = predicted.reshape(-1)
    labels = labels.reshape(-1)

    # check whether both sizes are identical
    assert predicted.size(0) == labels.size(0)

    k_number = max(int(predicted.size(0) * k / 100), 1)

    # indices of the k_number largest values of each input
    topk_predicted_index = torch.argsort(predicted, descending = True)[:k_number]
    topk_label_index = torch.argsort(labels, descending = True)[:k_number]

    # the indices within each group are unique, so the number of hits is the
    # size of the set intersection (linear instead of a quadratic scan)
    hit_count = len(set(topk_predicted_index.tolist()) & set(topk_label_index.tolist()))

    accuracy = hit_count/k_number

    return (accuracy, hit_count, k_number)

## 6. Automatic Training

In [None]:
# Default hyper-parameters of train():
LR_REGRESSION = 5e-3  # learning rate of the Adam optimizer
EPOCH = 10  # maximum number of training epochs
EARLY_STOP_PATIENCE = 2  # consecutive validation-loss increases tolerated before stopping

In [None]:
def _iter_aligned_batches(h1_loader, h2_loader, h3_loader, numeric_tensors):
    """Yield `(start, end, g1, g2, g3, numeric)` for every batch of a split.

    The three loaders are walked in lockstep.  `start:end` is the row range
    of the split covered by the current batch (derived from the number of
    graphs in the batch) and `numeric` is the matching slice of
    `numeric_tensors`.  This relies on the loaders being unshuffled.
    """
    start = 0
    for graph_h1, graph_h2, graph_h3 in zip(h1_loader, h2_loader, h3_loader):
        end = start + graph_h1.num_graphs
        yield (
            start,
            end,
            graph_h1.to(device),
            graph_h2.to(device),
            graph_h3.to(device),
            numeric_tensors[start:end],
        )
        start = end


def train(lr_regression=LR_REGRESSION, max_epoch=EPOCH, early_stop_patience=EARLY_STOP_PATIENCE, verbose=True, manual_seed=None):
    """Train a fresh PopularityModel and evaluate it on the test split.

    The function works on the module-level data (`train_*`, `cv_*` and
    `test_*` loaders, numeric tensors and labels).  Training minimises the
    MSE with Adam and stops early when the validation MSE did not decrease
    for `early_stop_patience` consecutive epochs, or when the training loss
    is exactly the same as in the previous epoch.

    Args:
        lr_regression: learning rate of the Adam optimizer.
        max_epoch: maximum number of training epochs.
        early_stop_patience: number of consecutive epochs without a decrease
            of the validation MSE after which training stops.
        verbose: print the progress and the metrics and plot the loss curves.
        manual_seed: seed for the torch RNG; a fresh seed is drawn when None.

    Returns:
        dict with the seed that was used, the test metrics (`mae`, `mse`,
        hit rate `hr{1,5,10,15}p`, `ndcg{1,5,10,15}p`) and, under `param`,
        the hyper-parameters of the run.
    """
    # `is not None` so that an explicit seed of 0 is honoured as well
    seed = manual_seed if manual_seed is not None else torch.random.seed()
    torch.manual_seed(seed)

    popularity_model = PopularityModel(node_dimension = 8).to(device)
    popularity_model.train()

    # optimizer instances
    optimizer_regression = torch.optim.Adam(popularity_model.parameters(), lr=lr_regression)

    train_losses = []

    # cross validation for early stopping
    current_val_error = float('inf')
    val_error_inc_count = 0
    cv_losses = []

    # pre-bound so that the cleanup at the end also works when no batch ran
    graph_h1 = graph_h2 = graph_h3 = None
    batch_labels = predicted = loss = None

    for epoch in range(max_epoch):

        if verbose:
            print(f"{epoch}", end=".")

        batch_losses = []

        # training
        train_batches = _iter_aligned_batches(
            train_h1_dataset, train_h2_dataset, train_h3_dataset, train_numeric_tensors
        )
        for start, end, graph_h1, graph_h2, graph_h3, batch_numeric_features in train_batches:

            if verbose:
                print(".", end="")

            optimizer_regression.zero_grad()

            batch_labels = train_labels[start:end]

            # forward: GNN->RNN->LR
            predicted = popularity_model(graph_h1, graph_h2, graph_h3, batch_numeric_features)

            # compute loss (mean squared error)
            loss = F.mse_loss(predicted, batch_labels, reduction='mean')

            # backward propagation
            loss.backward()
            optimizer_regression.step()

            # keep the plain number only, so the autograd graph of the batch
            # can be released right away
            batch_losses.append(loss.item())

        train_loss = torch.tensor(batch_losses).mean().item()

        if verbose:
            print(f"Loss: {train_loss:.4f}", end=",\n")

        # an exactly repeated training loss means the model stopped changing;
        # stop regardless of `verbose`
        if (epoch > 0) and (train_loss == train_losses[-1]):
            if verbose:
                print(f"early stopping triggered! stopped at epoch {epoch}")
            break

        train_losses.append(train_loss)

        # cross validation & early stopping
        with torch.no_grad():

            batch_losses = []

            cv_batches = _iter_aligned_batches(
                cv_h1_dataset, cv_h2_dataset, cv_h3_dataset, cv_numeric_tensors
            )
            for start, end, graph_h1, graph_h2, graph_h3, batch_numeric_features in cv_batches:

                batch_labels = cv_labels[start:end]

                # forward: GNN->RNN->LR
                predicted = popularity_model(graph_h1, graph_h2, graph_h3, batch_numeric_features)

                # compute loss (mean squared error)
                loss = F.mse_loss(predicted, batch_labels, reduction='mean')

                batch_losses.append(loss.item())

            cv_error = torch.tensor(batch_losses).mean().item()

        cv_losses.append(cv_error)

        # the validation error is compared with the one of the previous epoch
        if cv_error >= current_val_error:
            val_error_inc_count += 1
            current_val_error = cv_error
            if val_error_inc_count >= early_stop_patience:
                if verbose:
                    print(f"early stopping triggered! stopped at epoch {epoch}")
                break
        else:
            val_error_inc_count = 0
            current_val_error = cv_error

    # evaluation on the test split
    testset_size = test_labels.size(0)

    with torch.no_grad():

        model_test_predicted = torch.zeros((testset_size,), dtype=torch.float).to(device)

        test_batches = _iter_aligned_batches(
            test_h1_dataset, test_h2_dataset, test_h3_dataset, test_numeric_tensors
        )
        for start, end, graph_h1, graph_h2, graph_h3, batch_numeric_features in test_batches:

            # forward: GNN->RNN->LR
            model_test_predicted[start:end] = popularity_model(
                graph_h1, graph_h2, graph_h3, batch_numeric_features
            ).squeeze(1)

    # labels are stored as an (N, 1) column; flatten them so that the losses
    # compare element-wise instead of broadcasting to an (N, N) matrix
    flat_test_labels = test_labels.squeeze(1)

    mae = F.l1_loss(model_test_predicted, flat_test_labels).item()
    mse = F.mse_loss(model_test_predicted, flat_test_labels).item()

    percentages = (1, 5, 10, 15)

    hit_rates = {
        percent: accuracy_at_k(model_test_predicted, flat_test_labels, percent)
        for percent in percentages
    }

    # ndcg_score expects (n_samples, n_labels) arrays: one ranking of N items
    ndcg_true = flat_test_labels.unsqueeze(0).cpu().numpy()
    ndcg_predicted = model_test_predicted.unsqueeze(0).cpu().numpy()
    ndcg_scores = {
        # at least one item, like accuracy_at_k does for small test sets
        percent: sklearn.metrics.ndcg_score(
            ndcg_true, ndcg_predicted, k=max(int(testset_size * percent / 100), 1)
        )
        for percent in percentages
    }

    if verbose:
        print(f"\nseed: {seed}")
        print(f"MAE: {mae}")
        print(f"MSE: {mse}")
        for percent in percentages:
            print(f"Hit Rate@{percent}%: {hit_rates[percent]}")
        for percent in percentages:
            print(f"NDCG@{percent}%: {ndcg_scores[percent]}")

        # plot loss curve
        plt.plot(train_losses, label = 'training')
        plt.plot(cv_losses, label = 'validation')
        plt.xlabel('epoch')
        plt.ylabel('MSE')
        plt.legend()
        plt.show()

    # drop every reference to device memory before emptying the CUDA cache
    popularity_model = None
    optimizer_regression = None
    graph_h1, graph_h2, graph_h3 = None, None, None
    batch_numeric_features = None
    batch_labels = None
    predicted = None
    loss = None
    model_test_predicted = None
    torch.cuda.empty_cache()

    return {
        'seed': seed,
        'mae': mae,
        'mse': mse,
        'hr1p': hit_rates[1][0],
        'hr5p': hit_rates[5][0],
        'hr10p': hit_rates[10][0],
        'hr15p': hit_rates[15][0],
        'ndcg1p': ndcg_scores[1],
        'ndcg5p': ndcg_scores[5],
        'ndcg10p': ndcg_scores[10],
        'ndcg15p': ndcg_scores[15],
        'param': {
            'lr_regression': lr_regression,
            'max_epoch': max_epoch,
            'early_stop_patience': early_stop_patience
        }
    }

In [None]:
# single run with a fixed seed, so that the result can be reproduced
train(lr_regression=2e-3, max_epoch=500, early_stop_patience=EARLY_STOP_PATIENCE, verbose=True, manual_seed=17643402734797685269)

In [None]:
# Train several models with the same setting but a fresh random seed each,
# and collect their test metrics together with an aggregate ranking score.
MODELS_PER_SETTING = 10
results = []
for run_number in range(MODELS_PER_SETTING):
    print(f"model {run_number+1} - {datetime.now()}")

    res = train(
        lr_regression=2e-3,
        max_epoch=500,
        early_stop_patience=EARLY_STOP_PATIENCE,
        verbose=True,
        manual_seed=None,
    )
    res['number'] = run_number

    # the score is the plain sum of all hit rates and all NDCG values
    res['score'] = sum(
        res[metric]
        for metric in ('hr1p', 'hr5p', 'hr10p', 'hr15p', 'ndcg1p', 'ndcg5p', 'ndcg10p', 'ndcg15p')
    )
    results.append(res)
    print(f"model {run_number+1} done - {datetime.now()}")