# Cascade Network + User Profile + User Timeline + Source Tweet

## 1. Import Packages

In [None]:
import numpy as np
import pandas as pd
import scipy
import warnings
import os.path as path
import json
from datetime import datetime

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F

# Transformer / BERT
import transformers as ppb
from transformers import AdamW

# plotting
import matplotlib.pyplot as plt

# Torch-Geometric
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv

# silence library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# train on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"device type: {device.type}")

## 2. Import Dataset

In [None]:
# CSV holding the news table; its first row is the header.
DATASET_NEWS = "./data/all/news.csv"

news_df = pd.read_csv(DATASET_NEWS, header=0)

# quick sanity summary of what was loaded
for summary_line in (f"Columns: {news_df.columns}", f"news set size: {len(news_df)}"):
    print(summary_line)

In [None]:
# Source-tweet CSVs for the train / cross-validation / test splits.
ST_DATASET_TRAIN = "./data/splitted/source_tweet_train.csv"
ST_DATASET_CV = "./data/splitted/source_tweet_cv.csv"
ST_DATASET_TEST = "./data/splitted/source_tweet_test.csv"

# Load the three splits in order; the first row of each file is the header.
st_train_df, st_cv_df, st_test_df = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (ST_DATASET_TRAIN, ST_DATASET_CV, ST_DATASET_TEST)
]

print(f"Columns: {st_train_df.columns}")
for split_label, split_df in (
    ("train", st_train_df),
    ("cross validation", st_cv_df),
    ("test", st_test_df),
):
    print(f"{split_label} set size: {len(split_df)}")

In [None]:
# Processed user-profile CSVs for the train / cross-validation / test splits.
UP_DATASET_TRAIN = "./data/splitted/user_profile_processed_train.csv"
UP_DATASET_CV = "./data/splitted/user_profile_processed_cv.csv"
UP_DATASET_TEST = "./data/splitted/user_profile_processed_test.csv"

# Load the three splits in order; the first row of each file is the header.
up_train_df, up_cv_df, up_test_df = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (UP_DATASET_TRAIN, UP_DATASET_CV, UP_DATASET_TEST)
]

print(f"Columns: {up_train_df.columns}")
for split_label, split_df in (
    ("train", up_train_df),
    ("cross validation", up_cv_df),
    ("test", up_test_df),
):
    print(f"{split_label} set size: {len(split_df)}")

In [None]:
# Processed user-timeline CSVs for the train / cross-validation / test splits.
UT_DATASET_TRAIN = "./data/splitted/user_timeline_processed_train.csv"
UT_DATASET_CV = "./data/splitted/user_timeline_processed_cv.csv"
UT_DATASET_TEST = "./data/splitted/user_timeline_processed_test.csv"

# Load the three splits in order; the first row of each file is the header.
ut_train_df, ut_cv_df, ut_test_df = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (UT_DATASET_TRAIN, UT_DATASET_CV, UT_DATASET_TEST)
]

print(f"Columns: {ut_train_df.columns}")
for split_label, split_df in (
    ("train", ut_train_df),
    ("cross validation", ut_cv_df),
    ("test", ut_test_df),
):
    print(f"{split_label} set size: {len(split_df)}")

In [None]:
# Cascade-network CSVs for the train / cross-validation / test splits.
CN_DATASET_TRAIN = "./data/splitted/cascade_network_train.csv"
CN_DATASET_CV = "./data/splitted/cascade_network_cv.csv"
CN_DATASET_TEST = "./data/splitted/cascade_network_test.csv"

# Load the three splits in order; the first row of each file is the header.
cn_train_df, cn_cv_df, cn_test_df = [
    pd.read_csv(csv_path, header=0)
    for csv_path in (CN_DATASET_TRAIN, CN_DATASET_CV, CN_DATASET_TEST)
]

print(f"Columns: {cn_train_df.columns}")
for split_label, split_df in (
    ("train", cn_train_df),
    ("cross validation", cn_cv_df),
    ("test", cn_test_df),
):
    print(f"{split_label} set size: {len(split_df)}")

## 3. Preprocess Data

In [None]:
# Load the pretrained DistilBERT tokenizer (uncased base checkpoint).
# Every text column tokenized in this notebook goes through this object.
_PRETRAINED_TOKENIZER_NAME = 'distilbert-base-uncased'
tokenizer = ppb.DistilBertTokenizer.from_pretrained(_PRETRAINED_TOKENIZER_NAME)

### News

In [None]:
# Encode every news title and body into a list of token ids, including the
# tokenizer's special tokens. Truncation and padding happen in later cells.
news_title_tokenized = news_df['title'].apply(
    lambda title: tokenizer.encode(title, add_special_tokens=True)
)
print(f"news title tokenized shape: {news_title_tokenized.shape}")

news_text_tokenized = news_df['text'].apply(
    lambda body: tokenizer.encode(body, add_special_tokens=True)
)
# label fixed: this reports the news *text* column (was printed as "test")
print(f"news text tokenized shape: {news_text_tokenized.shape}")

In [None]:
# Fixed token budgets for the news title and the news body.
NEWS_TITLE_LENGTH = 20
NEWS_TEXT_LENGTH = 64

# Keep only the leading tokens of each sequence; shorter sequences are left
# unchanged here and get padded in the next cell.
news_title_tokenized = news_title_tokenized.apply(lambda ids: ids[:NEWS_TITLE_LENGTH])
news_text_tokenized = news_text_tokenized.apply(lambda ids: ids[:NEWS_TEXT_LENGTH])

In [None]:
# Right-pad every token list with 0 up to its fixed length, so each field
# becomes a rectangular (rows x length) array.
news_title_padded = np.array([
    token_ids + [0] * (NEWS_TITLE_LENGTH - len(token_ids))
    for token_ids in news_title_tokenized.values
])
news_text_padded = np.array([
    token_ids + [0] * (NEWS_TEXT_LENGTH - len(token_ids))
    for token_ids in news_text_tokenized.values
])

print(f"news_title_padded: {news_title_padded.shape}")
print(f"news_text_padded: {news_text_padded.shape}")

In [None]:
# Attention mask: 1 wherever the token id is non-zero, 0 on the padding.
news_title_attention_mask = (news_title_padded != 0).astype(int)
news_text_attention_mask = (news_text_padded != 0).astype(int)

In [None]:
NEWS_NUMERIC_COLUMNS = ['img_count']

# Row position of every news id, built once. The previous code ran a full
# boolean scan of news_df for every tweet and then used the resulting index
# *label* as a position. First occurrence wins, matching the old `.index[0]`.
news_row_by_id = {}
for _position, _news_id in enumerate(news_df['id'].values):
    news_row_by_id.setdefault(_news_id, _position)

news_numeric_values = news_df[NEWS_NUMERIC_COLUMNS].values


def _news_features_for_split(split_df):
    """Gather the news features referenced by each row of ``split_df``.

    Returns a 5-tuple of tensors on ``device``, aligned with the rows of
    ``split_df``: (text ids, text attention mask, title ids, title attention
    mask, numeric features). A ``news_id`` that is absent from ``news_df``
    raises ``KeyError``.
    """
    rows = np.asarray(
        [news_row_by_id[news_id] for news_id in split_df['news_id'].values],
        dtype=np.intp,
    )
    return (
        torch.tensor(news_text_padded[rows], dtype=torch.long).to(device),
        # BUG FIX: the masks used to be filled from the padded token ids;
        # they must come from the attention-mask arrays computed above.
        torch.tensor(news_text_attention_mask[rows], dtype=torch.long).to(device),
        torch.tensor(news_title_padded[rows], dtype=torch.long).to(device),
        torch.tensor(news_title_attention_mask[rows], dtype=torch.long).to(device),
        # float (not long): these features are concatenated with the other
        # float numeric features before they reach the model's linear layers.
        torch.tensor(news_numeric_values[rows], dtype=torch.float).to(device),
    )


(train_news_text_tensors, train_news_text_mask,
 train_news_title_tensors, train_news_title_mask,
 train_news_numeric_tensors) = _news_features_for_split(st_train_df)

(cv_news_text_tensors, cv_news_text_mask,
 cv_news_title_tensors, cv_news_title_mask,
 cv_news_numeric_tensors) = _news_features_for_split(st_cv_df)

(test_news_text_tensors, test_news_text_mask,
 test_news_title_tensors, test_news_title_mask,
 test_news_numeric_tensors) = _news_features_for_split(st_test_df)

### Source Tweet (Text)

In [None]:
# Tokenize the source-tweet text of every split (special tokens included).
post_train_tokenized, post_cv_tokenized, post_test_tokenized = [
    split_df['text'].apply(lambda tweet: tokenizer.encode(tweet, add_special_tokens=True))
    for split_df in (st_train_df, st_cv_df, st_test_df)
]

print(f"train tokenized shape: {post_train_tokenized.shape}")

In [None]:
# Token count of every source tweet across the three splits.
lengths = [
    len(token_ids)
    for tokenized in (post_train_tokenized, post_cv_tokenized, post_test_tokenized)
    for token_ids in tokenized.values
]

# The median token count drives the fixed text length chosen below.
median_length = np.median(np.array(lengths))
print(median_length)

In [None]:
# Fixed source-tweet length: half of the median token count (rounded down).
TEXT_LENGTH = int(median_length/2)

# Cut every tokenized tweet down to at most TEXT_LENGTH tokens.
post_train_tokenized = post_train_tokenized.apply(lambda ids: ids[:TEXT_LENGTH])
post_cv_tokenized = post_cv_tokenized.apply(lambda ids: ids[:TEXT_LENGTH])
post_test_tokenized = post_test_tokenized.apply(lambda ids: ids[:TEXT_LENGTH])

In [None]:
# Right-pad each tweet with 0 up to TEXT_LENGTH so each split is rectangular.
post_train_padded, post_cv_padded, post_test_padded = [
    np.array([token_ids + [0] * (TEXT_LENGTH - len(token_ids)) for token_ids in tokenized.values])
    for tokenized in (post_train_tokenized, post_cv_tokenized, post_test_tokenized)
]

print(f"train_padded: {post_train_padded.shape}")

In [None]:
# Attention masks: 1 on non-zero token ids, 0 on the padding.
post_train_attention_mask = (post_train_padded != 0).astype(int)
post_cv_attention_mask = (post_cv_padded != 0).astype(int)
post_test_attention_mask = (post_test_padded != 0).astype(int)

In [None]:
# Move the padded token ids and their masks onto the training device.
train_post_tensors, cv_post_tensors, test_post_tensors = [
    torch.tensor(padded, dtype=torch.long).to(device)
    for padded in (post_train_padded, post_cv_padded, post_test_padded)
]

train_post_mask, cv_post_mask, test_post_mask = [
    torch.tensor(mask, dtype=torch.long).to(device)
    for mask in (post_train_attention_mask, post_cv_attention_mask, post_test_attention_mask)
]

### User Profile (Text)

In [None]:
# Tokenize the user-profile description of every split (special tokens
# included). BUG FIX: the cell referenced train_df / cv_df / test_df, which
# are never defined in this notebook; the user-profile frames loaded above
# are up_train_df / up_cv_df / up_test_df.
# NOTE(review): assumes the user-profile CSVs carry a 'description' column - confirm.
train_des_tokenized = up_train_df['description'].apply(lambda x: tokenizer.encode(x, add_special_tokens = True))
cv_des_tokenized = up_cv_df['description'].apply(lambda x: tokenizer.encode(x, add_special_tokens = True))
test_des_tokenized = up_test_df['description'].apply(lambda x: tokenizer.encode(x, add_special_tokens = True))

In [None]:
# Total number of description tokens over the three splits.
total_length = sum(
    len(token_ids)
    for tokenized in (train_des_tokenized, cv_des_tokenized, test_des_tokenized)
    for token_ids in tokenized.values
)

# Mean description length in tokens, rounded down to an integer.
description_count = train_des_tokenized.shape[0] + cv_des_tokenized.shape[0] + test_des_tokenized.shape[0]
average_length = int(total_length / description_count)

In [None]:
# Cut every tokenized description down to at most average_length tokens.
train_des_tokenized = train_des_tokenized.apply(lambda ids: ids[:average_length])
cv_des_tokenized = cv_des_tokenized.apply(lambda ids: ids[:average_length])
test_des_tokenized = test_des_tokenized.apply(lambda ids: ids[:average_length])

In [None]:
# Right-pad each description with 0 up to average_length.
train_des_padded, cv_des_padded, test_des_padded = [
    np.array([token_ids + [0] * (average_length - len(token_ids)) for token_ids in tokenized.values])
    for tokenized in (train_des_tokenized, cv_des_tokenized, test_des_tokenized)
]

print(f"train_padded: {train_des_padded.shape}")

In [None]:
 # masking
# 1 on non-zero description token ids, 0 on the padding
train_attention_mask = (train_des_padded != 0).astype(int)
cv_attention_mask = (cv_des_padded != 0).astype(int)
test_attention_mask = (test_des_padded != 0).astype(int)

In [None]:
# Move the padded description ids and their masks onto the training device.
train_text_tensors, cv_text_tensors, test_text_tensors = [
    torch.tensor(padded, dtype=torch.long).to(device)
    for padded in (train_des_padded, cv_des_padded, test_des_padded)
]

train_text_mask, cv_text_mask, test_text_mask = [
    torch.tensor(mask, dtype=torch.long).to(device)
    for mask in (train_attention_mask, cv_attention_mask, test_attention_mask)
]

### Source Tweet (Numeric)

In [None]:
NEWS_NUMERIC_COLUMNS = ['img_count']

# Row position of every news id, built once. The previous code ran a full
# boolean scan of news_df per tweet and used the resulting index *label* as
# an .iloc position. First occurrence wins, matching the old `.index[0]`.
news_position_by_id = {}
for _position, _news_id in enumerate(news_df['id'].values):
    news_position_by_id.setdefault(_news_id, _position)

news_numeric_values = news_df[NEWS_NUMERIC_COLUMNS].values

# One (rows x len(NEWS_NUMERIC_COLUMNS)) tensor per split, aligned with the
# rows of the source-tweet frames. A news_id missing from news_df raises
# KeyError. dtype is float (not long) because these features are concatenated
# with the other float numeric features before reaching the linear layers.
train_news_numeric_tensors, cv_news_numeric_tensors, test_news_numeric_tensors = [
    torch.tensor(
        news_numeric_values[
            np.asarray(
                [news_position_by_id[news_id] for news_id in split_df['news_id'].values],
                dtype=np.intp,
            )
        ],
        dtype=torch.float,
    ).to(device)
    for split_df in (st_train_df, st_cv_df, st_test_df)
]

print(train_news_numeric_tensors.shape)

In [None]:
# Count-style statistics of the tweet text.
statistic_columns = ['user_count', 'tag_count', 'symbol_count', 'url_count', 'sentence_count']

# Posting-time indicator columns: 24 hour-of-day columns, 7 weekday columns
# and a holiday flag.
wds = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
temporal_columns = []
temporal_columns.extend(f"h_{h:02}" for h in range(0,24))
temporal_columns.extend(f"wday_{wd}" for wd in wds)
temporal_columns.append('is_holiday')

sentiment_columns = ['avg_sentiment_score', 'sentiment_ratio', 'pos_count', 'neg_count']

post_numeric_columns = statistic_columns + temporal_columns + sentiment_columns
print(post_numeric_columns)

# Float tensors on the training device, one per split.
train_post_numeric_tensors, cv_post_numeric_tensors, test_post_numeric_tensors = [
    torch.tensor(split_df[post_numeric_columns].values, dtype=torch.float).to(device)
    for split_df in (st_train_df, st_cv_df, st_test_df)
]

for numeric_tensor in (train_post_numeric_tensors, cv_post_numeric_tensors, test_post_numeric_tensors):
    print(numeric_tensor.shape)

#### User Profile (Numeric)

In [None]:
up_numeric_columns = ['followers_count', 'friends_count', 'listed_count', 'favorites_count', 'statuses_count']
up_boolean_columns = ['protected', 'geo_enabled', 'verified']

# Replace True/False by 1/0 in every boolean column of every split.
d = {True: 1, False: 0}
for profile_df in (up_train_df, up_cv_df, up_test_df):
    for c in up_boolean_columns:
        profile_df[c] = profile_df[c].map(d)

# Boolean flags first, then the count columns, as float tensors on the device.
up_feature_columns = up_boolean_columns+up_numeric_columns
train_up_numeric_tensors, cv_up_numeric_tensors, test_up_numeric_tensors = [
    torch.tensor(profile_df[up_feature_columns].values, dtype=torch.float).to(device)
    for profile_df in (up_train_df, up_cv_df, up_test_df)
]

for numeric_tensor in (train_up_numeric_tensors, cv_up_numeric_tensors, test_up_numeric_tensors):
    print(numeric_tensor.shape)

### User Timeline (Text)

In [None]:
# Tokenize the user-timeline text of every split (special tokens included).
# BUG FIX: the cell referenced train_df / cv_df / test_df, which are never
# defined in this notebook; the user-timeline frames loaded above are
# ut_train_df / ut_cv_df / ut_test_df.
# NOTE(review): assumes the user-timeline CSVs carry a 'text' column - confirm.
train_text_tokenized = ut_train_df['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens = True))
cv_text_tokenized = ut_cv_df['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens = True))
test_text_tokenized = ut_test_df['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens = True))

In [None]:
 # truncate description using average length
# Cut every tokenized timeline text to at most average_length tokens (the
# length computed from the profile descriptions above).
train_text_tokenized = train_text_tokenized.apply(lambda ids: ids[:average_length])
cv_text_tokenized = cv_text_tokenized.apply(lambda ids: ids[:average_length])
test_text_tokenized = test_text_tokenized.apply(lambda ids: ids[:average_length])

In [None]:
# Right-pad each timeline text with 0 up to average_length.
train_text_padded, cv_text_padded, test_text_padded = [
    np.array([token_ids + [0] * (average_length - len(token_ids)) for token_ids in tokenized.values])
    for tokenized in (train_text_tokenized, cv_text_tokenized, test_text_tokenized)
]

print(f"train_padded: {train_text_padded.shape}")

In [None]:
 # masking
# 1 on non-zero timeline token ids, 0 on the padding
train_attention_mask = (train_text_padded != 0).astype(int)
cv_attention_mask = (cv_text_padded != 0).astype(int)
test_attention_mask = (test_text_padded != 0).astype(int)

In [None]:
# Move the padded timeline ids and their masks onto the training device.
# These names rebind the tensors created for the profile descriptions.
train_text_tensors, cv_text_tensors, test_text_tensors = [
    torch.tensor(padded, dtype=torch.long).to(device)
    for padded in (train_text_padded, cv_text_padded, test_text_padded)
]

train_text_mask, cv_text_mask, test_text_mask = [
    torch.tensor(mask, dtype=torch.long).to(device)
    for mask in (train_attention_mask, cv_attention_mask, test_attention_mask)
]

#### User Timeline (Numeric)

In [None]:
# Numeric user-timeline features fed to the model.
ut_numeric_columns = [
    'avg_rt_count', 'avg_favorite_count', 'sensitive_ratio',
    'sentence_count', 'avg_sentiment_score', 'sentiment_ratio', 'pos_count', 'neg_count',
]

# Float tensors on the training device, one per split.
train_ut_numeric_tensors, cv_ut_numeric_tensors, test_ut_numeric_tensors = [
    torch.tensor(timeline_df[ut_numeric_columns].values, dtype=torch.float).to(device)
    for timeline_df in (ut_train_df, ut_cv_df, ut_test_df)
]

for numeric_tensor in (train_ut_numeric_tensors, cv_ut_numeric_tensors, test_ut_numeric_tensors):
    print(numeric_tensor.shape)

#### Cascade Network

In [None]:
# One list of graphs per split and per cascade snapshot (h1, h2, h3).
train_h1_graphs, train_h2_graphs, train_h3_graphs = [], [], []
cv_h1_graphs, cv_h2_graphs, cv_h3_graphs = [], [], []
test_h1_graphs, test_h2_graphs, test_h3_graphs = [], [], []

# Destination lists, in the same order as the dataframes iterated below.
_graph_sinks = [
    (train_h1_graphs, train_h2_graphs, train_h3_graphs),
    (cv_h1_graphs, cv_h2_graphs, cv_h3_graphs),
    (test_h1_graphs, test_h2_graphs, test_h3_graphs),
]

for df_idx, df in enumerate([cn_train_df, cn_cv_df, cn_test_df]):

    for row_idx, row in df.iterrows():

        # The node/edge columns hold single-quoted text; swapping the quotes
        # lets json.loads parse them.
        h1_nodes = json.loads(row['h1_nodes'].replace("'", '"'))
        h1_edges = json.loads(row['h1_edges'].replace("'", '"'))
        h2_nodes = json.loads(row['h2_nodes'].replace("'", '"'))
        h2_edges = json.loads(row['h2_edges'].replace("'", '"'))
        h3_nodes = json.loads(row['h3_nodes'].replace("'", '"'))
        h3_edges = json.loads(row['h3_edges'].replace("'", '"'))

        graph_series = []
        for (nodes, edges) in [(h1_nodes, h1_edges), (h2_nodes, h2_edges), (h3_nodes, h3_edges)]:

            # Re-index the original node ids to 0..n-1 in dictionary order and
            # collect each node's feature vector in that same order.
            original_ids = list(nodes.keys())
            node_features = [nodes[node_id] for node_id in original_ids]
            node_reindex_map = {
                int(node_id): new_id for new_id, node_id in enumerate(original_ids)
            }

            # edge: [user A, user A's follower] -> (source, target) in new ids
            nodes_from = [node_reindex_map[edge[0]] for edge in edges]
            nodes_to = [node_reindex_map[edge[1]] for edge in edges]

            graph = Data(
                x=torch.tensor(node_features, dtype=torch.float),
                edge_index=torch.tensor([nodes_from, nodes_to], dtype=torch.long),
            ).to(device)
            graph_series.append(graph)

        # Route the three snapshots of this row to the lists of its split.
        h1_sink, h2_sink, h3_sink = _graph_sinks[df_idx]
        h1_sink.append(graph_series[0])
        h2_sink.append(graph_series[1])
        h3_sink.append(graph_series[2])

In [None]:
BATCH_SIZE = 25000

# shuffle=False keeps each loader in list order, so the graph batches stay
# aligned with the row-ordered feature tensors sliced by index during training.
train_h1_dataset, train_h2_dataset, train_h3_dataset = [
    DataLoader(graphs, batch_size=BATCH_SIZE, shuffle=False)
    for graphs in (train_h1_graphs, train_h2_graphs, train_h3_graphs)
]

cv_h1_dataset, cv_h2_dataset, cv_h3_dataset = [
    DataLoader(graphs, batch_size=BATCH_SIZE, shuffle=False)
    for graphs in (cv_h1_graphs, cv_h2_graphs, cv_h3_graphs)
]

test_h1_dataset, test_h2_dataset, test_h3_dataset = [
    DataLoader(graphs, batch_size=BATCH_SIZE, shuffle=False)
    for graphs in (test_h1_graphs, test_h2_graphs, test_h3_graphs)
]

In [None]:
# Numeric columns read from the cascade-network frames.
CN_NUMERIC_COLUMNS = [
    'max_deg', 'avg_deg', 'min_deg',
    'max_timediff', 'min_timediff', 'avg_timediff',
    'node_number', 'edge_number',
]

In [None]:
# Cascade-level numeric features as float tensors on the device, one per split.
train_cascade_numeric_tensors, cv_cascade_numeric_tensors, test_cascade_numeric_tensors = [
    torch.tensor(cascade_df[CN_NUMERIC_COLUMNS].values, dtype=torch.float).to(device)
    for cascade_df in (cn_train_df, cn_cv_df, cn_test_df)
]

for numeric_tensor in (
    train_cascade_numeric_tensors,
    cv_cascade_numeric_tensors,
    test_cascade_numeric_tensors,
):
    print(numeric_tensor.shape)

#### Labels

In [None]:
# Regression targets: the 'cascade_size' column of the user-timeline frames,
# reshaped to a column vector (rows x 1) on the training device.
train_labels, cv_labels, test_labels = [
    torch.tensor(timeline_df['cascade_size'].values, dtype=torch.float).unsqueeze(1).to(device)
    for timeline_df in (ut_train_df, ut_cv_df, ut_test_df)
]

print(train_labels.shape)

## 4. Model

In [None]:
# attention network

class AttentionLayer(nn.Module):
    """Two-layer feed-forward block that outputs per-feature attention weights.

    The input passes through linear -> ReLU -> dropout -> linear, and the
    result is softmax-normalised along dim 1, so every row of the output sums
    to one and has the same width as the input.
    """

    def __init__(self, feature_dim, drop_ratio=0):
        super().__init__()

        # Both projections keep the feature width unchanged.
        self.linear1 = nn.Linear(feature_dim, feature_dim)
        self.linear2 = nn.Linear(feature_dim, feature_dim)
        self.dropout = nn.Dropout(p=drop_ratio)

    def forward(self, x):
        """Return softmax weights for ``x`` of shape batch_size x feature_dim."""
        hidden = self.dropout(F.relu(self.linear1(x)))
        scores = self.linear2(hidden)
        return F.softmax(scores, dim=1)

In [None]:
class PopularityModel(torch.nn.Module):
    """GCN + GRU + attention-weighted MLP regressor.

    Each of the three graph batches goes through its own GCN layer and is
    sum-pooled into one vector per graph. A GRU cell consumes the three pooled
    vectors in order (graph1, graph2, graph3); its final hidden state is
    concatenated with the numeric features, re-weighted by an attention layer
    and passed through a three-layer MLP that outputs one value per sample.

    Args:
        node_dimension: width of the node feature vectors.
        numeric_dimension: width of the concatenated numeric feature vector.
    """

    def __init__(self, node_dimension=8, numeric_dimension = (41+1+8+8+8)):

        super().__init__()

        # dimensions setup
        self.node_dim = node_dimension
        self.numeric_dim = numeric_dimension
        self.rnn_hidden_dim = int(self.node_dim/2)
        self.mlp_dim = self.rnn_hidden_dim + self.numeric_dim

        # GCN layers: one per graph input, each keeps the node width
        self.g1_gconv = GCNConv(self.node_dim, self.node_dim)
        self.g2_gconv = GCNConv(self.node_dim, self.node_dim)
        self.g3_gconv = GCNConv(self.node_dim, self.node_dim)

        # RNN over the three pooled graph vectors
        self.rnn = nn.GRUCell(self.node_dim, int(self.node_dim/2))

        self.attention = AttentionLayer(self.mlp_dim, drop_ratio=0)

        # final MLP layers
        self.linear1 = nn.Linear(self.mlp_dim, int(self.mlp_dim/2))
        self.linear2 = nn.Linear(int(self.mlp_dim/2), int(self.mlp_dim/4))
        self.linear3 = nn.Linear(int(self.mlp_dim/4), 1)

    @staticmethod
    def _sum_pool(node_x, batch, num_graphs):
        """Sum node rows per graph.

        ``batch[i]`` is the graph index of node row ``i``. Returns a
        ``num_graphs x node_x.size(1)`` tensor; graphs without any node get a
        zero row. A single ``index_add_`` replaces the former Python loop that
        called ``torch.nonzero`` once per graph.
        """
        pooled = torch.zeros(
            (num_graphs, node_x.size(1)), dtype=node_x.dtype, device=node_x.device
        )
        pooled.index_add_(0, batch, node_x)
        return pooled

    def forward(self, graph1, graph2, graph3, numeric_features):
        """Predict one value per sample.

        Args:
            graph1, graph2, graph3: batched graph objects exposing ``x``,
                ``edge_index`` and ``batch``; ``graph1.num_graphs`` sets the
                batch size used for all three.
            numeric_features: batch_size x numeric_dimension tensor.

        Returns:
            ``(y, attention_weights)``: ``y`` is batch_size x 1 and
            non-negative (final ReLU); ``attention_weights`` is
            batch_size x mlp_dim.
        """

        batch_size = graph1.num_graphs

        # GCN + ReLU per graph input, then sum-pool to batch_size x node_dim
        g1_x = F.relu(self.g1_gconv(graph1.x, graph1.edge_index))
        g1_x_pooled = self._sum_pool(g1_x, graph1.batch, batch_size)

        g2_x = F.relu(self.g2_gconv(graph2.x, graph2.edge_index))
        g2_x_pooled = self._sum_pool(g2_x, graph2.batch, batch_size)

        g3_x = F.relu(self.g3_gconv(graph3.x, graph3.edge_index))
        g3_x_pooled = self._sum_pool(g3_x, graph3.batch, batch_size)

        # NOTE(review): the initial hidden state is drawn at random on every
        # call (kept as in the original), so outputs are not deterministic
        # even in eval mode unless the RNG is seeded.
        hx = torch.randn((batch_size, self.rnn_hidden_dim), dtype=torch.float).to(g1_x_pooled.device)
        hx = self.rnn(g1_x_pooled, hx)
        hx = self.rnn(g2_x_pooled, hx)
        hx = self.rnn(g3_x_pooled, hx)

        concatenated = torch.cat((hx, numeric_features), 1)

        attention_weights = self.attention(concatenated)

        # element-wise re-weighting of the concatenated features
        x = concatenated * attention_weights

        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        y = F.relu(self.linear3(x))

        return (y, attention_weights)

## 5. Evaluation

In [None]:
def accuracy_at_k(predicted, labels, k = 10):
    """Overlap between the top-k% predictions and the top-k% labels.

    Args:
        predicted: tensor of predicted scores, shape (N,) or (N, 1).
        labels: tensor of ground-truth values with the same number of elements.
        k: percentage of the samples that forms the "top" set.

    Returns:
        ``(accuracy, hit_count, k_number)`` where ``k_number`` is
        ``max(int(N * k / 100), 1)``, ``hit_count`` is how many of the top
        ``k_number`` predicted samples are also among the top ``k_number``
        labelled samples, and ``accuracy = hit_count / k_number``.
    """

    # argsort works along the last dimension, so an (N, 1) column vector would
    # yield all-zero indices and a bogus accuracy of 1.0. Flatten first.
    flat_predicted = predicted.reshape(-1)
    flat_labels = labels.reshape(-1)

    # check whether both sizes are identical
    assert flat_predicted.size(0) == flat_labels.size(0)

    k_number = max(int(flat_predicted.size(0) * k / 100), 1)

    # sample indices of the k_number largest values on each side
    topk_predicted_index = torch.argsort(flat_predicted, descending = True)[:k_number]
    topk_label_index = torch.argsort(flat_labels, descending = True)[:k_number]

    # argsort indices are unique, so the intersection size is the hit count
    hit_count = len(set(topk_predicted_index.tolist()) & set(topk_label_index.tolist()))

    accuracy = hit_count/k_number

    return (accuracy, hit_count, k_number)

## 6. Training

In [None]:
# Default training hyper-parameters, used as the default arguments of train().
LR_REGRESSION = 4e-3       # learning rate of the Adam optimizer
EPOCH = 200                # maximum number of training epochs
EARLY_STOP_PATIENCE = 2    # early-stopping patience (see train())

# A stray expression that added the five numeric feature tensors together was
# removed from this cell: their widths (41, 1, 8, 8, 8) are not broadcastable,
# so it raised a RuntimeError. The tensors are concatenated per batch inside
# train() instead.

In [None]:
def _iter_batches(graph_loaders, numeric_tensors, labels):
    """Yield aligned mini-batches for one pass over a data split.

    Args:
        graph_loaders: the three graph loaders (h1, h2, h3) of the split. They
            are advanced in lock-step, one batch per step.
            NOTE(review): assumes every loader yields unshuffled batches of
            BATCH_SIZE samples so they line up with the index slices below.
        numeric_tensors: per-sample numeric feature tensors; each is sliced by
            sample index and the slices are concatenated along dim 1.
        labels: label tensor of the split, sliced with the same indices.

    Yields:
        `(start, end, graphs, numeric_features, batch_labels)` where `graphs`
        is the `(h1, h2, h3)` tuple moved to `device`.
    """
    h1_it, h2_it, h3_it = (iter(loader) for loader in graph_loaders)
    total = labels.size(0)
    for start in range(0, total, BATCH_SIZE):
        end = min(start + BATCH_SIZE, total)
        graphs = tuple(graph.to(device) for graph in (next(h1_it), next(h2_it), next(h3_it)))
        numeric_features = torch.cat([tensor[start:end] for tensor in numeric_tensors], 1)
        yield start, end, graphs, numeric_features, labels[start:end]


def train(regression_lr=LR_REGRESSION, max_epoch=EPOCH, early_stop_patience=EARLY_STOP_PATIENCE, verbose=True, manual_seed=None):
    """Train one PopularityModel, early-stop on the CV split and score it on the test split.

    Side effects: prints progress and metrics, writes the attention weights of
    the test samples whose label exceeds 95 to
    `./records/attention_weights_<seed>` (JSON), and shows a loss plot when
    `verbose` is true.

    Args:
        regression_lr: learning rate of the Adam optimizer.
        max_epoch: maximum number of training epochs.
        early_stop_patience: number of consecutive epochs in which the
            validation loss does not decrease before training stops.
        verbose: print per-epoch progress, final metrics and the loss curves.
        manual_seed: seed for `torch.manual_seed`; when None a fresh seed is
            drawn with `torch.random.seed()`.

    Returns:
        dict with the keys 'seed', 'mae', 'mse', 'hr1p', 'hr5p', 'hr10p',
        'hr15p', 'ndcg1p', 'ndcg5p', 'ndcg10p' and 'ndcg15p'.
    """
    import os  # local import: the notebook only imports `os.path as path`

    # `is not None` so that an explicit seed of 0 is honoured
    seed = manual_seed if manual_seed is not None else torch.random.seed()
    torch.manual_seed(seed)

    popularity_model = PopularityModel(node_dimension=8, numeric_dimension = (41+1+8+8+8)).to(device)
    optimizer_regression = torch.optim.Adam(popularity_model.parameters(), lr = regression_lr)

    # graph loaders and numeric feature groups of every split; the feature
    # order (post=41, news=1, user profile=8, user timeline=8, cascade=8 dims)
    # must match `numeric_dimension` above
    train_loaders = (train_h1_dataset, train_h2_dataset, train_h3_dataset)
    train_numeric = (
        train_post_numeric_tensors,
        train_news_numeric_tensors,
        train_up_numeric_tensors,
        train_ut_numeric_tensors,
        train_cascade_numeric_tensors,
    )
    cv_loaders = (cv_h1_dataset, cv_h2_dataset, cv_h3_dataset)
    cv_numeric = (
        cv_post_numeric_tensors,
        cv_news_numeric_tensors,
        cv_up_numeric_tensors,
        cv_ut_numeric_tensors,
        cv_cascade_numeric_tensors,
    )
    test_loaders = (test_h1_dataset, test_h2_dataset, test_h3_dataset)
    test_numeric = (
        test_post_numeric_tensors,
        test_news_numeric_tensors,
        test_up_numeric_tensors,
        test_ut_numeric_tensors,
        test_cascade_numeric_tensors,
    )

    epoch_losses = []

    # cross validation for early stopping
    current_val_error = float('inf')
    val_error_inc_count = 0
    cv_losses = []

    # pre-bind the names released at the end so the cleanup also works when a
    # loop body never runs (e.g. max_epoch == 0)
    graphs = batch_numeric_features = batch_labels = None
    predicted = attn_weights = loss = None

    for epoch in range(max_epoch):

        if verbose:
            print(epoch, end="")

        # validation below switches to eval mode, so re-enable train mode here
        popularity_model.train()
        batch_losses = []

        # batch training
        for _, _, graphs, batch_numeric_features, batch_labels in _iter_batches(train_loaders, train_numeric, train_labels):

            if verbose:
                print(".", end="")

            optimizer_regression.zero_grad()

            # forward: GNN+RNN+Linear Regression
            predicted, attn_weights = popularity_model(*graphs, batch_numeric_features)

            # compute loss (mean squared error)
            loss = F.mse_loss(predicted, batch_labels, reduction='mean')

            # backward propagation
            loss.backward()
            optimizer_regression.step()

            # store a plain float so the autograd graph of the batch is freed
            batch_losses.append(loss.item())

        epoch_loss = torch.tensor(batch_losses).mean().item()
        if verbose:
            print(f"Loss: {epoch_loss:.4f}", end=",\n")

        # a training loss identical to the previous epoch means the model has
        # stopped changing; stop regardless of `verbose`
        if (epoch > 0) and (epoch_loss == epoch_losses[-1]):
            if verbose:
                print(f"early stopping triggered! stopped at epoch {epoch}")
            break

        epoch_losses.append(epoch_loss)

        # validation pass
        popularity_model.eval()
        with torch.no_grad():

            batch_losses = []

            for _, _, graphs, batch_numeric_features, batch_labels in _iter_batches(cv_loaders, cv_numeric, cv_labels):

                # forward: GNN+RNN+Linear Regression
                predicted, attn_weights = popularity_model(*graphs, batch_numeric_features)

                # compute loss (mean squared error)
                loss = F.mse_loss(predicted, batch_labels, reduction='mean')

                batch_losses.append(loss.item())

            cv_error = torch.tensor(batch_losses).mean().item()
            cv_losses.append(cv_error)

        # patience counts consecutive epochs whose validation loss did not
        # drop below that of the previous epoch
        if cv_error >= current_val_error:
            val_error_inc_count += 1
        else:
            val_error_inc_count = 0
        current_val_error = cv_error

        if val_error_inc_count >= early_stop_patience:
            if verbose:
                print(f"early stopping triggered! stopped at epoch {epoch}")
            break

    # evaluation
    popularity_model.eval()
    with torch.no_grad():

        testset_size = test_labels.size(0)

        model_test_predicted = torch.zeros((testset_size,), dtype=torch.float).to(device)
        feature_attention_weights = torch.zeros((testset_size, popularity_model.mlp_dim), dtype=torch.float).to(device)

        for start, end, graphs, batch_numeric_features, batch_labels in _iter_batches(test_loaders, test_numeric, test_labels):

            # forward: GNN+RNN+Linear Regression
            predicted, attn_weights = popularity_model(*graphs, batch_numeric_features)
            model_test_predicted[start:end] = predicted.squeeze(1)
            feature_attention_weights[start:end] = attn_weights

        # get average attention weights for every feature
        avg_feature_importance = torch.mean(feature_attention_weights, 0)
        print("Averaged Feature Importance:")
        print(avg_feature_importance)

        # 1-D view of the labels so they line up element-wise with the 1-D
        # prediction vector
        test_labels_flat = test_labels.squeeze(1)

        # record attention weights individually, only for samples whose label
        # exceeds 95 (NOTE(review): presumably the most popular items — confirm)
        importance_records = [
            {
                'idx': idx,
                'label': label,
                'attention_weights': feature_attention_weights[idx].tolist()
            }
            for idx, label in enumerate(test_labels_flat.tolist())
            if label > 95
        ]
        os.makedirs("./records", exist_ok=True)
        with open(f"./records/attention_weights_{seed}", 'w') as f:
            json.dump(importance_records, f)

        # Compare against the flattened labels: pairing the 1-D predictions
        # with the un-squeezed label column would broadcast to an N x N matrix
        # and report wrong MAE/MSE values.
        model_mae_scores = F.l1_loss(model_test_predicted, test_labels_flat)
        model_mse_scores = F.mse_loss(model_test_predicted, test_labels_flat)

        hit_rate_top1p = accuracy_at_k(model_test_predicted, test_labels_flat, 1)
        hit_rate_top5p = accuracy_at_k(model_test_predicted, test_labels_flat, 5)
        hit_rate_top10p = accuracy_at_k(model_test_predicted, test_labels_flat, 10)
        hit_rate_top15p = accuracy_at_k(model_test_predicted, test_labels_flat, 15)

        ndcg_true = test_labels.reshape((1, -1)).cpu()
        ndcg_pred = model_test_predicted.unsqueeze(0).cpu()

        def ndcg_at(percent):
            # keep at least one item in the cut-off, like accuracy_at_k does,
            # so small test sets do not end up with k == 0
            cutoff = max(int(testset_size * percent / 100), 1)
            return sklearn.metrics.ndcg_score(ndcg_true, ndcg_pred, k=cutoff)

        ndcg_score_1p = ndcg_at(1)
        ndcg_score_5p = ndcg_at(5)
        ndcg_score_10p = ndcg_at(10)
        ndcg_score_15p = ndcg_at(15)

        if verbose:
            print(f"seed: {seed}")
            print(f"MAE: {model_mae_scores.item()}")
            print(f"MSE: {model_mse_scores.item()}")
            print(f"Hit Rate@1%: {hit_rate_top1p}")
            print(f"Hit Rate@5%: {hit_rate_top5p}")
            print(f"Hit Rate@10%: {hit_rate_top10p}")
            print(f"Hit Rate@15%: {hit_rate_top15p}")
            print(f"NDCG@1%: {ndcg_score_1p}")
            print(f"NDCG@5%: {ndcg_score_5p}")
            print(f"NDCG@10%: {ndcg_score_10p}")
            print(f"NDCG@15%: {ndcg_score_15p}")

            plt.plot(epoch_losses, label = 'training')
            plt.plot(cv_losses, label = 'validation')
            plt.xlabel('epoch')
            plt.ylabel('MSE')
            plt.legend()
            plt.show()

        # drop the references to model/batch objects so that empty_cache()
        # can actually return their CUDA memory
        popularity_model = None
        optimizer_regression = None
        graphs = None
        batch_numeric_features = None
        batch_labels = None
        predicted = None
        attn_weights = None
        loss = None

        torch.cuda.empty_cache()

        return {
            'seed': seed,
            'mae': model_mae_scores.item(),
            'mse': model_mse_scores.item(),
            'hr1p': hit_rate_top1p[0],
            'hr5p': hit_rate_top5p[0],
            'hr10p': hit_rate_top10p[0],
            'hr15p': hit_rate_top15p[0],
            'ndcg1p': ndcg_score_1p,
            'ndcg5p': ndcg_score_5p,
            'ndcg10p': ndcg_score_10p,
            'ndcg15p': ndcg_score_15p
        }

In [None]:
# Train MODELS_PER_SETTING models, each with a freshly drawn seed
# (manual_seed=None), and collect the metric dict of every run in `results`.
MODELS_PER_SETTING = 10
results = []
for i in range(MODELS_PER_SETTING):
    print(f"model {i+1} - {datetime.now()}")

    res = train(
        regression_lr=4e-3,
        max_epoch=200,
        early_stop_patience=2,
        verbose=True,
        manual_seed=None,
    )
    res['number'] = i
    # aggregate score: plain sum of the four hit rates and the four NDCG values
    res['score'] = sum(
        res[metric]
        for metric in ('hr1p', 'hr5p', 'hr10p', 'hr15p', 'ndcg1p', 'ndcg5p', 'ndcg10p', 'ndcg15p')
    )
    print(f"score: {res['score']}")
    results.append(res)
    print(f"model {i+1} done - {datetime.now()} \n")