In [1]:
import pandas as pd
import pickle

from libs.NetworkBuilder import NetworkBuilder
from libs.ScisciDataset import ScisciDataset

In [2]:
# Load the pickled dataset; later cells pass it to NetworkBuilder and read its `eid` column.
df = pd.read_pickle('../../data/dataset.p')

In [3]:
# Load the references table from JSON; filter_references (defined further down)
# reads its `eid` and `eid_of_ref` columns.
ref_df = pd.read_json('../../data/references.json')

In [4]:
def save_dataset(dataset, file_name):
    """Pickle ``dataset`` to ``file_name`` with the highest pickle protocol.

    Missing parent directories of ``file_name`` are created first, so saving
    into an output folder that does not exist yet no longer fails in ``open``.

    Parameters
    ----------
    dataset : object
        Any picklable object.
    file_name : str or os.PathLike
        Destination path. An existing file at that path is overwritten.
    """
    import os  # local import keeps this cell self-contained

    # Only path-like targets have a parent directory to create; anything else
    # (e.g. an already-open file descriptor) is handed to open() unchanged.
    if isinstance(file_name, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(file_name))
        if parent_dir:  # empty when file_name has no directory component
            os.makedirs(parent_dir, exist_ok=True)

    with open(file_name, 'wb') as handle:
        pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)


# TF-IDF

In [4]:
# TF-IDF run: create the NetworkBuilder from the dataset and the references table.
nb = NetworkBuilder(df, references=ref_df)

In [5]:
# prepare_metadata returns two objects, saved below as x.p and LE.p.
# NOTE(review): LE is presumably a fitted label encoder — confirm in libs.NetworkBuilder.
X,LE = nb.prepare_metadata(text_encoding = 'tfidf')

In [9]:
# build_edges returns the edge index and eid_row_indices
# (presumably an eid -> row-position lookup — confirm in libs.NetworkBuilder).
edge_index,eid_row_indices = nb.build_edges()

In [9]:
# Persist the four TF-IDF artefacts, one pickle each, in a fixed order.
for _tfidf_obj, _tfidf_path in (
    (edge_index, '../../data/torch/tfidf/edge_pairs.p'),
    (X, '../../data/torch/tfidf/x.p'),
    (eid_row_indices, '../../data/torch/tfidf/eid_row_indices.p'),
    (LE, '../../data/torch/tfidf/LE.p'),
):
    save_dataset(_tfidf_obj, _tfidf_path)

In [None]:
# Read the saved TF-IDF x.p back from disk.
# NOTE(review): this cell has no execution count (In [None]) — it was not run in the saved notebook.
x_tfidf = pd.read_pickle('../../data/torch/tfidf/x.p')

# D2V

In [5]:
# Re-read the same dataset pickle that was loaded at the top of the notebook.
# NOTE(review): presumably done to start the D2V run from an unmodified frame — confirm
# whether NetworkBuilder changes `df` in place; if not, this reload is redundant.
df = pd.read_pickle('../../data/dataset.p')

In [6]:
# D2V run: create a fresh NetworkBuilder from the dataset and the references table.
nb = NetworkBuilder(df, references=ref_df)

In [7]:
# Same call as in the TF-IDF section, with text_encoding='d2v'; overwrites X and LE.
X,LE = nb.prepare_metadata(text_encoding = 'd2v')

In [10]:
# Rebuild the edges for the D2V run; overwrites edge_index and eid_row_indices.
edge_index,eid_row_indices = nb.build_edges()

In [13]:
def filter_references(corpus_eid, references):
    """Return the reference rows whose ``eid`` and ``eid_of_ref`` are both in the corpus.

    Rows with a missing ``eid_of_ref`` are dropped and the remaining rows are
    re-indexed from 0. Each remaining ``eid_of_ref`` is rewritten as the string
    ``'2-s2.0-<int>'``. Only rows where both ``eid_of_ref`` and ``eid`` appear
    in ``corpus_eid`` are returned; they keep their post-reset index labels.
    ``references`` itself is not modified.
    """
    # Drop references without an id and renumber the surviving rows.
    has_ref_id = references['eid_of_ref'].notnull()
    identified = references[has_ref_id].reset_index(drop=True)

    # Prefix the integer id with '2-s2.0-' so it can be compared with corpus_eid.
    def _as_eid(raw_id):
        return '2-s2.0-' + str(int(raw_id))

    identified = identified.assign(
        eid_of_ref=identified['eid_of_ref'].apply(_as_eid)
    )

    # Keep a row only when both of its ids belong to the corpus.
    inside_corpus = (
        identified['eid_of_ref'].isin(corpus_eid)
        & identified['eid'].isin(corpus_eid)
    )
    return identified[inside_corpus]

In [18]:
# Restrict the references to rows whose ids both occur among the dataset's eids.
filter_ref_df = filter_references(
    corpus_eid=df['eid'].unique(),
    references=ref_df,
)

In [29]:
# Every id that appears in either column of the filtered references.
network_ids = {*filter_ref_df['eid_of_ref'], *filter_ref_df['eid']}

In [33]:
# Write one network id per line. The ids are written in sorted order because
# iterating a set of strings directly yields an order that can change from one
# kernel run to the next, which made the output file non-reproducible.
# key=str matches the "%s" formatting used below and avoids comparing mixed types.
with open('../../results/network_ids.txt', 'w') as f:
    for item in sorted(network_ids, key=str):
        f.write("%s\n" % item)

In [121]:
# Persist the four D2V artefacts, one pickle each, in a fixed order.
for _d2v_obj, _d2v_path in (
    (edge_index, '../../data/torch/d2v/edge_pairs.p'),
    (X, '../../data/torch/d2v/x.p'),
    (eid_row_indices, '../../data/torch/d2v/eid_row_indices.p'),
    (LE, '../../data/torch/d2v/LE.p'),
):
    save_dataset(_d2v_obj, _d2v_path)

In [11]:
# Read the saved D2V x.p back from disk for inspection.
x_d2v = pd.read_pickle('../../data/torch/d2v/x.p')

In [12]:
# Show the dimensions of the reloaded D2V features.
x_d2v.shape

torch.Size([22151, 463])

In [20]:
# Build the ScisciDataset from the D2V artefacts stored under ../../data/torch/d2v.
# NOTE(review): root='test' is presumably the folder where the dataset keeps its
# processed files — confirm in libs.ScisciDataset.
dataset = ScisciDataset(root='test',raw_path = '../../data/torch/d2v')
# Take the first item of the dataset.
data = dataset[0]
# Disabled edge split: train_test_split_edges is not imported in the cells shown here.
#data = train_test_split_edges(data)

Processing...
Done!


In [21]:
# Display the summary of the first dataset item.
# NOTE(review): the recorded output reports x with 263 columns, while x_d2v.shape
# recorded earlier reports 463 — one of the two outputs is stale or 'test' holds
# files processed from a different x.p; confirm by re-running on a fresh kernel.
data

Data(edge_index=[2, 68797], x=[22151, 263])

# BERT

In [6]:
# BERT run: create a fresh NetworkBuilder from the dataset and the references table.
# NOTE(review): unlike the D2V section, `df` is not re-read before this call — confirm
# that is intentional (it matters only if the earlier runs changed `df` in place).
nb = NetworkBuilder(df, references=ref_df)

In [7]:
# Same call as in the earlier sections, with text_encoding='bert'; overwrites X and LE.
X,LE = nb.prepare_metadata(text_encoding = 'bert')

In [8]:
# Rebuild the edges for the BERT run; overwrites edge_index and eid_row_indices.
edge_index,eid_row_indices = nb.build_edges()

In [9]:
# Show the dimensions of the BERT feature matrix before saving it.
X.shape

torch.Size([22151, 831])

In [10]:
# Persist the four BERT artefacts, one pickle each, in a fixed order.
for _bert_obj, _bert_path in (
    (edge_index, '../../data/torch/bert/edge_pairs.p'),
    (X, '../../data/torch/bert/x.p'),
    (eid_row_indices, '../../data/torch/bert/eid_row_indices.p'),
    (LE, '../../data/torch/bert/LE.p'),
):
    save_dataset(_bert_obj, _bert_path)