In [1]:
import pandas as pd
import pickle
from tqdm.notebook import tqdm

import os
import shutil

from libs.NetworkBuilder import NetworkBuilder
from libs.ScisciDataset import ScisciDataset

In [2]:
# Load the pickled dataset that is later handed to AblatedDataset.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it comes from a trusted source.
df = pd.read_pickle('../../data/dataset.p')

In [4]:
# Load the references table from JSON; AblatedDataset forwards it to
# NetworkBuilder as the `references` argument.
ref_df = pd.read_json('../../data/references.json')

In [5]:
class AblatedDataset:
    """Write one copy of the dataset per ablated metadata feature.

    The edge structure is built once at construction time; the feature
    matrix is rebuilt for every entry in ``self.variations`` with that
    feature removed and pickled into its own sub-folder.

    Attributes:
        nb: The ``NetworkBuilder`` created from ``df`` and ``ref_df``.
        edge_index: First value returned by ``nb.build_edges()``.
        eid_row_indices: Second value returned by ``nb.build_edges()``.
        variations: Feature names passed one by one as ``remove=`` to
            ``nb.prepare_metadata``. The trailing ``None`` removes nothing
            (its output is stored in a folder literally named ``None``).
    """

    def __init__(self, df, ref_df):
        """Create the network builder and build the shared edge list.

        Args:
            df: Dataset passed to ``NetworkBuilder``.
            ref_df: References passed to ``NetworkBuilder`` as ``references``.
        """
        self.nb = NetworkBuilder(df, references=ref_df)
        # The result of this call was never used, so it is no longer bound to
        # a name. The call itself is kept because build_edges() may rely on
        # state prepared here -- TODO confirm against NetworkBuilder.
        self.nb.prepare_metadata(remove=None)
        self.edge_index, self.eid_row_indices = self.nb.build_edges()
        self.variations = [
            "affiliation",
            "first_author",
            "coverDate",
            "subject_area",
            "topic_dist",
            "bert",
            "citations_at",
            None,
        ]

    def save_dataset(self, dataset, file_name):
        """Pickle ``dataset`` to ``file_name`` using the highest protocol.

        Args:
            dataset: Any picklable object.
            file_name: Destination path; an existing file is overwritten.
        """
        with open(file_name, 'wb') as handle:
            pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def generate_dataset(self, output_dir='data'):
        """Generate and save one dataset per entry in ``self.variations``.

        For each feature a folder ``<output_dir>/<feature>`` is recreated from
        scratch (any previous content is deleted) and filled with
        ``edge_pairs.p`` (the shared edge index) and ``x.p`` (the feature
        matrix built without that feature).

        Args:
            output_dir: Root folder for the generated datasets. Defaults to
                ``'data'``, the location used before this parameter existed.
                Missing parent folders are created.
        """
        for feature in tqdm(self.variations, desc="generating dataset"):
            # Only the feature matrix is needed; the second return value of
            # prepare_metadata is ignored.
            X, _ = self.nb.prepare_metadata(remove=feature)

            # str() mirrors the original '{}'.format(feature), so the
            # un-ablated run still lands in a folder named 'None'.
            feature_dir = os.path.join(output_dir, str(feature))
            if os.path.exists(feature_dir):
                shutil.rmtree(feature_dir)
            # makedirs (not mkdir) so a missing output root does not crash
            # the run on a fresh checkout.
            os.makedirs(feature_dir)

            self.save_dataset(self.edge_index, os.path.join(feature_dir, 'edge_pairs.p'))
            self.save_dataset(X, os.path.join(feature_dir, 'x.p'))

            print("dataset without {} saved".format(feature))
            print("shape: {}".format(X.shape))




In [6]:
# Build the ablation helper. Construction creates the NetworkBuilder and
# builds the edge index once, so it is shared by every generated dataset.
ablated_data = AblatedDataset(df, ref_df)

In [7]:
# Write one dataset per entry in ablated_data.variations under
# data/<feature>/; an existing folder for a feature is deleted and recreated.
ablated_data.generate_dataset()

HBox(children=(FloatProgress(value=0.0, description='generating dataset', max=8.0, style=ProgressStyle(descrip…

dataset without affiliation saved
shape: torch.Size([22151, 830])
dataset without first_author saved
shape: torch.Size([22151, 830])
dataset without coverDate saved
shape: torch.Size([22151, 830])
dataset without subject_area saved
shape: torch.Size([22151, 802])
dataset without topic_dist saved
shape: torch.Size([22151, 811])
dataset without bert saved
shape: torch.Size([22151, 63])
dataset without citations_at saved
shape: torch.Size([22151, 820])
dataset without None saved
shape: torch.Size([22151, 831])

