In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# hyperparameters
seed = 42
valid_size=0.3 # validation + testing
test_size=0.5 # tesing / (testing+validation)
learning_rate = 0.001
epochs = 10
batch_size=32

random.seed(seed)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device=torch.device('cpu')
print(device)

cuda


In [278]:
# load data
df_cascade = pd.read_json("../processed_data/df_cascade.json")
df_authors = pd.read_json("../processed_data/df_authors.json")

sum_positive = 0
for id, row in df_cascade.iterrows():
    sum_positive += len(row['citation_authors'])
avg_negative = sum_positive // len(df_cascade)

In [279]:
# sentence-bert to get embeddings

# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer("all-MiniLM-L6-v2")

# abstract_ls = df_abstract['abstract'].tolist()
# for i in range(len(abstract_ls)):
#     if type(abstract_ls[i]) == float:
#         abstract_ls[i] = ""

# abstract_embeddings = model.encode(abstract_ls, convert_to_tensor=True)
# torch.save(abstract_embeddings, 'abstract_embeddings.pt')

loaded_tensor = torch.load('../processed_data/abstract_embeddings.pt').to(device)

In [280]:
# add negative samples
df_cascade['negative_authors'] = [[] for _ in range(len(df_cascade))]
author_ls = df_authors['author_id'].tolist()
for index, row in df_cascade.iterrows():
    authors = row['citation_authors']
    temp_indices = []
    for _ in range(avg_negative):
        isInclude = True
        while(isInclude):
            rand_author_id = random.choice(author_ls)
            if rand_author_id in authors:
                continue
            else:
                isInclude = False
                temp_indices.append(rand_author_id)
    df_cascade.at[index, 'negative_authors'] = temp_indices


In [281]:
# split

train_data, temp_data = train_test_split(df_cascade, test_size=valid_size, random_state=seed)
val_data, test_data = train_test_split(temp_data, test_size=test_size, random_state=seed)

train_iter = []
for index, row in train_data.iterrows():
    paper_id = row['paper_id']
    for author in row['citation_authors']:
        train_iter.append([paper_id, int(author), 1])
    for negative in row['negative_authors']:
        train_iter.append([paper_id, int(negative), 0])

val_iter = []
for index, row in val_data.iterrows():
    paper_id = row['paper_id']
    for author in row['citation_authors']:
        val_iter.append([paper_id, int(author), 1])
    for negative in row['negative_authors']:
        val_iter.append([paper_id, int(negative), 0])

test_iter = []
for index, row in test_data.iterrows():
    paper_id = row['paper_id']
    for author in row['citation_authors']:
        test_iter.append([paper_id, int(author), 1])
    for negative in row['negative_authors']:
        test_iter.append([paper_id, int(negative), 0])

In [282]:
# data loader

class PairDataset(Dataset):
  def __init__(self, x):
    self.x_paper = np.array(x)[:,0]
    self.x_author = np.array(x)[:,1]
    self.y = np.array(x)[:,2]
  
  def __len__(self):
    return len(self.x_paper)

  def __getitem__(self, idx):
    x1_sample = df_abstract['abstract'][self.x_paper[idx]] # abstract of paper
    x2_sample = df_authors.loc[df_authors['author_id'] == self.x_author[idx], 'paper_ids']
    x2_sample = x2_sample.values[0] # list of paper ids
    y_sample = self.y[idx]
    
    return x1_sample, x2_sample, y_sample

train_data = PairDataset(train_iter)
valid_data = PairDataset(val_iter)
test_data = PairDataset(test_iter)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [283]:
idx = 0
x = train_iter
x_paper = np.array(x)[:,0]
x_author = np.array(x)[:,1]
x1_sample = df_abstract['abstract'][x_paper[idx]] # abstract of paper
x2_sample = df_authors.loc[df_authors['author_id'] == x_author[idx], 'paper_ids']
x2_sample = x2_sample.values[0]
print(type(x2_sample))
y_sample = y[idx]

<class 'list'>


In [286]:
for paper_id, author_id_ls, label in train_data:
    print(paper_id, author_id_ls, label)
    break

Freeblock scheduling is a new approach to utilizing more of a disk's potential media bandwidth. By filling rotational latency periods with useful media transfers, 20-50% of a never-idle disk's bandwidth can often be provided to background applications with no effect on foreground response times. This paper describes freeblock scheduling and demonstrates its value with simulation studies of two concrete applications: segment cleaning and data mining. Free segment cleaning often allows an LFS file system to maintain its ideal write performance when cleaning overheads would otherwise reduce performance by up to a factor of three. Free data mining can achieve over 47 full disk scans per day on an active transaction processing system, with no effect on its disk performance. [419613, 435686, 423309, 420607, 401526] 1


In [284]:
for paper_id, author_id_ls, label in train_loader:
    print(paper_id, author_id_ls, label)
    break

RuntimeError: each element in list of batch should be of equal size

In [9]:
# model

class FcClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, rnn_hidden_size, num_fc_layers_x2=1):
        super(FcClassifier, self).__init__()
        self.fc_x1 = nn.Sequential(
            nn.Linear(embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        self.fc_x2 = nn.ModuleList([nn.Linear(embedding_dim, rnn_hidden_size) for _ in range(num_fc_layers_x2)])
        self.rnn = nn.LSTM(rnn_hidden_size, rnn_hidden_size, batch_first=True)
        self.fc_output = nn.Linear(hidden_dim + rnn_hidden_size, hidden_dim)

    def forward(self, x1_idx, x2_idxs, sentence_embeddings):
        # Retrieve sentence embeddings
        x1_embedding = sentence_embeddings[x1_idx]
        x2_embeddings = [sentence_embeddings[idx] for idx in x2_idxs]

        # Process x1 embedding
        y1 = self.fc_x1(x1_embedding)

        # Process x2 embeddings
        x2_representations = [fc(emb) for fc, emb in zip(self.fc_x2, x2_embeddings)]
        x2_packed = nn.utils.rnn.pack_sequence(x2_representations, lengths=[len(rep) for rep in x2_representations])
        _, (last_hidden, _) = self.rnn(x2_packed)
        y2 = last_hidden.squeeze(0)

        # Concatenate y1 and y2
        final_output = self.fc_output(torch.cat([y1, y2], dim=-1))

        return final_output

In [None]:
# training loop

for epoch in range(epochs):
    model.train()
    for paper_id, author_id_ls, label in train_loader:
        optimizer.zero_grad()
        
        