In [107]:
%load_ext autoreload
%autoreload 2

In [108]:
import json
import pandas as pd

In [112]:
import sys
sys.path.append('../utils')

from print_utils import print_json_tree

In [117]:
# Load the dataset export from ../data/Dataset.json.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default.
with open('../data/Dataset.json', encoding='utf-8') as f:
    data = json.load(f)

# Print the first 2 levels of the JSON key tree.
print('Data tree for first 2 levels:')
print_json_tree(data, max_depth=2)

Data tree for first 2 levels:
- status
- message
  - version
  - utc_age
  - content


In [114]:
# Build a DataFrame from the value stored at data['message']['content'] and
# print its column / non-null count / dtype summary.
dataset = pd.DataFrame(data['message']['content'])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   doc_type                    285 non-null    object
 1   criticality_tag             285 non-null    object
 2   knowledge_type              285 non-null    object
 3   requirement_list            285 non-null    object
 4   title_list                  285 non-null    object
 5   location_list               285 non-null    object
 6   description_list            281 non-null    object
 7   internal_rating_list        284 non-null    object
 8   internal_ratingsource_list  285 non-null    object
 9   cvss_rating_list            229 non-null    object
 10  rule_list                   285 non-null    object
 11  cwe_id_list                 195 non-null    object
 12  cve_id_list                 253 non-null    object
 13  activity_list               285 non-null    object

In [18]:
# Display the first row of the DataFrame (positional index 0).
dataset.iloc[0]

doc_type                                                                summary
criticality_tag                                               [unrestricted, 0]
knowledge_type                                                          derived
requirement_list                                         [zFJvGo8BtW9hkJoz7Vo0]
title_list                    [{'element': 'use-after-free caused by do_subm...
location_list                 [{'location': '{'file': 'linux-libc-dev', 'res...
description_list              [{'element': 'A use-after-free vulnerability w...
internal_rating_list                 [{'element': 'MEDIUM', 'source': 'Trivy'}]
internal_ratingsource_list              [{'element': 'nvd', 'source': 'Trivy'}]
cvss_rating_list              [{'element': '[{'score': 5.5, 'version': 'V3',...
rule_list                     [{'element': 'CVE-2023-4132', 'source': 'Trivy'}]
cwe_id_list                       [{'element': ['CWE-416'], 'source': 'Trivy'}]
cve_id_list                   [{'element

In [119]:
from datetime import datetime

def extract_combine(lst, location=None, sep="; "):
    """Render a list of finding records as one human-readable string.

    Args:
        lst: Iterable of dicts. When ``location`` is falsy each dict must
            provide ``'element'`` and ``'source'``; when ``location`` is
            truthy each dict must provide ``'location'``, ``'amount'`` and
            ``'source'``. ``None`` or an empty list yields ``""``.
        location: Used only as a truthiness flag that selects the location
            layout; its actual value is ignored.
        sep: Text inserted between consecutive records so that multiple
            records do not run into each other.

    Returns:
        str: The formatted records joined by ``sep``, with leading and
        trailing whitespace stripped.

    Raises:
        KeyError: If a record lacks a key required by the selected layout.
    """
    if not lst:
        return ""

    if location:
        parts = [
            f"{item['location']} (Amount: {item['amount']}, Source: {item['source']})"
            for item in lst
        ]
    else:
        # Formatting instead of `+` keeps non-string elements (for example a
        # list of ids) from raising TypeError.
        parts = [f"{item['element']} ({item['source']})" for item in lst]

    return sep.join(parts).strip()
def get_latest_state(lst):
    """Return the ``'action'`` of the most recently created tag in ``lst``.

    Every item's ``'tags'`` list is considered, not just the first item's.
    The newest tag is the one with the greatest ``'created_at'`` value, which
    is parsed with ``datetime.fromisoformat``.

    Args:
        lst: Iterable of dicts, each optionally holding a ``'tags'`` list of
            dicts with ``'created_at'`` (ISO-8601 string) and ``'action'``
            keys. ``None`` is treated as empty.

    Returns:
        The ``'action'`` value of the newest tag, or ``None`` when there are
        no tags at all.

    Raises:
        ValueError: If a ``'created_at'`` value cannot be parsed by
            ``datetime.fromisoformat``.
        KeyError: If a tag lacks ``'created_at'``, or the newest tag lacks
            ``'action'``.
    """
    # Flatten tags across all items; a missing or null 'tags' contributes nothing.
    all_tags = [tag for item in (lst or ()) for tag in (item.get('tags') or ())]
    if not all_tags:
        # max() would raise ValueError on an empty sequence.
        return None

    latest = max(all_tags, key=lambda tag: datetime.fromisoformat(tag['created_at']))
    return latest['action']

# Flatten every entry that has a description into a small summary record
# holding title, location, last_state and description.
transformed_content = []
for entry in data['message']['content']:
    # Entries without a description are left out of the summary. `.get(...)`
    # also covers a key that is present but null, which would otherwise fail
    # when the list is iterated.
    if entry.get('description_list') is None:
        continue
    transformed_entry = {
        'title': extract_combine(entry['title_list']),
        'location': extract_combine(entry['location_list'], 'location'),
        'last_state': get_latest_state(entry['location_list']),
        'description': extract_combine(entry['description_list'])
    }
    transformed_content.append(transformed_entry)

# Write the summary records as indented JSON. The encoding is explicit so the
# file does not depend on the platform's locale default.
with open('summarised_findings.json', 'w', encoding='utf-8') as json_file:
    json.dump(transformed_content, json_file, indent=4)