In [1]:
import re
import copy
import logging
import collections
import numpy as np
import torch

def size_repr(key, item, indent=0):
    """Format one attribute as ``key=<summary>``, left-padded by ``indent`` spaces.

    The summary depends on the type of ``item``:

    * zero-dimensional tensor -> its Python scalar value
    * any other tensor        -> its shape as a list, e.g. ``[2, 3]``
    * list / tuple            -> its length wrapped in a list, e.g. ``[3]``
    * dict                    -> one recursively formatted line per entry,
                                 wrapped in braces
    * str                     -> the string in double quotes
    * anything else           -> ``str(item)``
    """
    pad = ' ' * indent

    if torch.is_tensor(item):
        # Scalars are shown by value, everything else by shape.
        summary = item.item() if item.dim() == 0 else str(list(item.size()))
    elif isinstance(item, (list, tuple)):
        summary = str([len(item)])
    elif isinstance(item, dict):
        # Nested entries are always rendered with an indent of 2 and then
        # shifted right by the padding of the enclosing entry.
        entries = ',\n'.join(
            pad + size_repr(name, value, 2) for name, value in item.items()
        )
        summary = '{\n' + entries + '\n' + pad + '}'
    elif isinstance(item, str):
        summary = f'"{item}"'
    else:
        summary = str(item)

    return f'{pad}{key}={summary}'

class Data(object):
    r"""A dictionary-like container holding the attributes of one graph.

    Attributes are stored as ordinary instance attributes and can be read
    and written either as ``data.x`` or as ``data['x']``. Attributes whose
    value is :obj:`None`, and attributes whose name starts or ends with a
    double underscore, are treated as "not present" by :attr:`keys`,
    :meth:`__len__`, :meth:`__contains__` and iteration.

    Args:
        x, edge_index, edge_attr, y, pos, normal, face: Predefined
            attributes, each defaulting to :obj:`None`.
        **kwargs: Additional attributes. ``num_nodes`` is stored under the
            private name ``__num_nodes__``; every other keyword becomes an
            attribute of the same name.
    """

    def __init__(self, x=None, edge_index=None, edge_attr=None, y=None,
                 pos=None, normal=None, face=None, **kwargs):
        self.x = x
        self.edge_index = edge_index
        self.edge_attr = edge_attr
        self.y = y
        self.pos = pos
        self.normal = normal
        self.face = face
        for key, item in kwargs.items():
            if key == 'num_nodes':
                self.__num_nodes__ = item
            else:
                self[key] = item

    @staticmethod
    def _is_sparse_tensor(item):
        r"""Returns :obj:`True` if the class of :obj:`item` (or one of its
        base classes) is named ``SparseTensor``.

        The check is done by name because no ``SparseTensor`` class is
        imported in this file; referring to that name directly would raise
        :class:`NameError`."""
        return any(klass.__name__ == 'SparseTensor'
                   for klass in type(item).__mro__)

    @classmethod
    def from_dict(cls, dictionary):
        r"""Creates a data object from a python dictionary."""
        data = cls()

        for key, item in dictionary.items():
            data[key] = item

        # The former `torch_geometric.is_debug_enabled()` hook was removed:
        # `torch_geometric` is not imported here and this class defines no
        # `debug()` method, so the call could only fail.
        return data

    def to_dict(self):
        r"""Returns all present attributes as a ``{name: value}`` dict."""
        return {key: item for key, item in self}

    def to_namedtuple(self):
        r"""Returns all present attributes as a ``DataTuple`` namedtuple."""
        keys = self.keys
        DataTuple = collections.namedtuple('DataTuple', keys)
        return DataTuple(*[self[key] for key in keys])

    def __getitem__(self, key):
        r"""Gets the data of the attribute :obj:`key`, or :obj:`None` if no
        such attribute exists."""
        return getattr(self, key, None)

    def __setitem__(self, key, value):
        """Sets the attribute :obj:`key` to :obj:`value`."""
        setattr(self, key, value)

    @property
    def keys(self):
        r"""Returns all names of graph attributes, i.e. attributes that are
        not :obj:`None` and whose name neither starts nor ends with a double
        underscore."""
        keys = [key for key in self.__dict__.keys() if self[key] is not None]
        keys = [key for key in keys if key[:2] != '__' and key[-2:] != '__']
        return keys

    def __len__(self):
        r"""Returns the number of all present attributes."""
        return len(self.keys)

    def __contains__(self, key):
        r"""Returns :obj:`True`, if the attribute :obj:`key` is present in the
        data."""
        return key in self.keys

    def __iter__(self):
        r"""Iterates over all present attributes in the data, yielding their
        attribute names and content (sorted by name)."""
        for key in sorted(self.keys):
            yield key, self[key]

    def __call__(self, *keys):
        r"""Iterates over all attributes :obj:`*keys` in the data, yielding
        their attribute names and content.
        If :obj:`*keys` is not given this method will iterate over all
        present attributes. Requested keys that are not present are
        skipped."""
        for key in sorted(self.keys) if not keys else keys:
            if key in self:
                yield key, self[key]

    def __cat_dim__(self, key, value):
        r"""Returns the dimension for which :obj:`value` of attribute
        :obj:`key` will get concatenated when creating batches.

        .. note::

            This method is for internal use only, and should only be overridden
            if the batch concatenation process is corrupted for a specific data
            attribute.
        """
        # Concatenate `*index*` and `*face*` attributes in the last dimension.
        if re.search(r'(index|face)', key):
            return -1
        # By default, concatenate sparse matrices diagonally.
        if self._is_sparse_tensor(value):
            return (0, 1)
        return 0

    def __repr__(self):
        cls = str(self.__class__.__name__)
        has_dict = any(isinstance(item, dict) for _, item in self)

        # Use the multi-line layout as soon as one attribute is a dict.
        if not has_dict:
            info = [size_repr(key, item) for key, item in self]
            return '{}({})'.format(cls, ', '.join(info))
        else:
            info = [size_repr(key, item, indent=2) for key, item in self]
            return '{}(\n{}\n)'.format(cls, ',\n'.join(info))

    def __apply__(self, item, func):
        r"""Applies :obj:`func` to :obj:`item` if it is a tensor, recursing
        into tuples, lists (both returned as lists) and dicts. Any other
        object is returned unchanged."""
        if torch.is_tensor(item):
            return func(item)
        elif self._is_sparse_tensor(item):
            # Not all apply methods are supported for `SparseTensor`, e.g.,
            # `contiguous()`. We can get around it by capturing the exception.
            try:
                return func(item)
            except AttributeError:
                return item
        elif isinstance(item, (tuple, list)):
            return [self.__apply__(v, func) for v in item]
        elif isinstance(item, dict):
            return {k: self.__apply__(v, func) for k, v in item.items()}
        else:
            return item

    def apply(self, func, *keys):
        r"""Applies the function :obj:`func` to all tensor attributes
        :obj:`*keys`. If :obj:`*keys` is not given, :obj:`func` is applied to
        all present attributes. Returns :obj:`self`.
        """
        for key, item in self(*keys):
            self[key] = self.__apply__(item, func)
        return self

    def contiguous(self, *keys):
        r"""Ensures a contiguous memory layout for all attributes :obj:`*keys`.
        If :obj:`*keys` is not given, all present attributes are ensured to
        have a contiguous memory layout."""
        return self.apply(lambda x: x.contiguous(), *keys)

    def to(self, device, *keys, **kwargs):
        r"""Performs tensor dtype and/or device conversion to all attributes
        :obj:`*keys`.
        If :obj:`*keys` is not given, the conversion is applied to all present
        attributes."""
        return self.apply(lambda x: x.to(device, **kwargs), *keys)

    def clone(self):
        r"""Returns a copy of this object: tensors are cloned and every
        other entry of ``__dict__`` (including private and :obj:`None`
        entries) is deep-copied."""
        return self.__class__.from_dict({
            k: v.clone() if torch.is_tensor(v) else copy.deepcopy(v)
            for k, v in self.__dict__.items()
        })

In [2]:
def getDataset(DIR):
    """Load the pickled graph data and the two NVD CSV tables.

    Args:
        DIR: Path prefix that is concatenated directly with the file names
            ``NVD_data``, ``NVD_CVE.csv`` and ``NVD_CWE.csv``, so a
            directory must be given with its trailing separator
            (e.g. ``'./NVD/'``).

    Returns:
        A tuple ``(data, df_CVE_merged, df_CWE)``: the unpickled object and
        the two CSV files as pandas DataFrames.
    """
    import pickle
    import pandas as pd

    # NOTE(security): unpickling can execute arbitrary code -- only point
    # this at files from a trusted source.
    # The context manager makes sure the file handle is closed again.
    with open(DIR + 'NVD_data', 'rb') as handle:
        data = pickle.load(handle)

    df_CVE_merged = pd.read_csv(DIR + 'NVD_CVE.csv', low_memory=False)
    df_CWE = pd.read_csv(DIR + 'NVD_CWE.csv', low_memory=False)

    return data, df_CVE_merged, df_CWE

#data, df_CVE_merged, df_CWE = getDataset('./NVD/')

In [19]:
def getRandomDataset(DIR, train_percent=0.70, validation_percent=0.10):
    """Load the dataset and redraw its train/validation/test masks at random.

    Every node selected by the union of the existing three masks is grouped
    by the classes for which its label row equals 1. Each class is shuffled
    and split according to ``train_percent`` / ``validation_percent``; the
    remainder of the class goes to the test set. The three masks of the
    loaded ``data`` object are then overwritten in place.

    A node carrying several labels is listed under each of its classes. To
    keep the three masks disjoint (no train/test leakage) such a node is
    assigned only once, by the first class (in index order) that reaches it.
    For nodes with a single label this is identical to a plain per-class
    split.

    Args:
        DIR: Path prefix forwarded to ``getDataset``.
        train_percent: Fraction of each class placed in the training set.
        validation_percent: Fraction of each class placed in the validation
            set.

    Returns:
        The tuple ``(data, df_CVE_merged, df_CWE)`` from ``getDataset`` with
        the masks of ``data`` replaced.

    Note:
        Shuffling uses the global NumPy random state; seed it beforehand
        for reproducible splits.
    """
    data, df_CVE_merged, df_CWE = getDataset(DIR)
    labels = data.y
    NUM_CLASSES = len(data.y[0])

    # Indices of all nodes that belong to any of the current splits.
    data_all_mask = data.train_mask | data.val_mask | data.test_mask
    all_mask = data_all_mask.nonzero().flatten().numpy()

    # class index -> indices of the nodes labelled with that class
    CWEs_data = collections.defaultdict(list)
    for key in all_mask:
        for cwe in np.where(labels[key] == 1)[0]:
            CWEs_data[cwe].append(key)

    train_set = []
    validation_set = []
    test_set = []
    assigned = set()  # nodes that already belong to one of the splits

    for key in range(NUM_CLASSES):
        if key not in CWEs_data:
            continue

        caselist = CWEs_data[key]
        np.random.shuffle(caselist)

        # Drop nodes an earlier class has already placed, so that no node
        # ends up in more than one split.
        fresh = [int(node) for node in caselist if int(node) not in assigned]
        assigned.update(fresh)

        count = len(fresh)
        train_len = int(count * train_percent)
        val_len = int(count * validation_percent)

        train_set.extend(fresh[:train_len])
        validation_set.extend(fresh[train_len:train_len + val_len])
        test_set.extend(fresh[train_len + val_len:])

    # Reset the masks in place, then switch on the freshly drawn indices.
    data.train_mask[:] = False
    data.val_mask[:] = False
    data.test_mask[:] = False

    data.train_mask[train_set] = True
    data.val_mask[validation_set] = True
    data.test_mask[test_set] = True

    return data, df_CVE_merged, df_CWE

#data, df_CVE_merged, df_CWE = getRandomDataset('./Dataset/NVD/processed/')

In [4]:
# type(data)
# new_data=Data()
# for key in data.keys:
#     new_data[key]=data[key]
# import pickle
# pickle.dump(new_data,open('./NVD/NVD_data', "wb" ))

In [8]:
def getDummyDataset():
    """Build a tiny hand-written dataset for quick experiments.

    The dataset has 13 sample sentences followed by 4 class-description
    sentences, one per class. Labels are one-hot rows; the row of a class
    sentence is the one-hot vector of its own class.

    Returns:
        A tuple ``(data, sentences, labels)``:

        * ``data`` -- a ``Data`` object with ``y`` (long one-hot labels),
          ``train_mask`` / ``val_mask`` / ``test_mask`` (all ``False`` on
          the class rows), ``class_mask`` (``True`` only on the class
          rows) and the hierarchy dicts ``child_parent``,
          ``parent_child`` and ``depth``.
        * ``sentences`` -- sample sentences followed by class sentences.
        * ``labels`` -- ``data.y`` converted to float.
    """
    import torch

    sample_sentences = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System anad human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
        "The quick brown fox jumps over the lazy dog",
        "it's been raining cats and dogs",
        "I don't know what to think any more",
        "I love animals",
    ]
    class_sentences = [
        "Human machine interaction of computer",
        "Graph tress algorithm testing",
        "EPS user are management",
        "Animal dogs are animals too",
    ]
    # Class index of every sample sentence, in the same order as above.
    sample_classes = [0, 0, 2, 2, 0, 1, 1, 1, 1, 3, 3, 3, 3]

    num_samples = len(sample_sentences)
    num_classes = len(class_sentences)

    # Rows of the identity matrix are the one-hot label vectors; the
    # identity itself doubles as the label block of the class sentences.
    one_hot = torch.eye(num_classes)
    y = torch.cat((one_hot[sample_classes], one_hot), dim=0).type(torch.long)

    def pad_for_classes(mask):
        # Append one False entry per class sentence.
        return torch.cat((mask, torch.zeros(num_classes, dtype=bool)), dim=0)

    train_mask = pad_for_classes(
        torch.tensor([1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1], dtype=bool))
    val_mask = pad_for_classes(torch.ones(num_samples, dtype=bool))
    test_mask = pad_for_classes(torch.ones(num_samples, dtype=bool))

    data = Data(train_mask=train_mask, val_mask=val_mask,
                test_mask=test_mask, y=y)

    # Alternative hierarchy with multiple parents, kept for reference:
    # data.child_parent={0:[1,3],1:[-1],2:[-1],3:[1,2]}
    # data.parent_child={-1:[1,2],1:[0,3],2:[3],3:[0]}
    # data.depth={0:[1,2],1:[0],2:[0],3:[1,1]}
    data.child_parent = {0: [1], 1: [-1], 2: [-1], 3: [-1]}
    data.parent_child = {-1: [1, 2, 3], 1: [0]}
    data.depth = {0: 1, 1: 0, 2: 0, 3: 0}

    # True only for the trailing class-description rows.
    data.class_mask = torch.cat(
        (torch.zeros(num_samples, dtype=bool),
         torch.ones(num_classes, dtype=bool)),
        dim=0)

    sentences = sample_sentences + class_sentences
    labels = data.y.float()

    return data, sentences, labels

#data, sentences, labels=getDummyDataset()

['Human machine interface for lab abc computer applications',
 'A survey of user opinion of computer system response time',
 'The EPS user interface management system',
 'System anad human system engineering testing of EPS',
 'Relation of user perceived response time to error measurement',
 'The generation of random binary unordered trees',
 'The intersection graph of paths in trees',
 'Graph minors IV Widths of trees and well quasi ordering',
 'Graph minors A survey',
 'The quick brown fox jumps over the lazy dog',
 "it's been raining cats and dogs",
 "I don't know what to think any more",
 'I love animals',
 'Human machine interaction of computer',
 'Graph tress algorithm testing',
 'EPS user are management',
 'Animal dogs are animals too']